Example #1
def parseVideoInfo(sourceUrl, websiteId):
    log.debug('Fetching video info %s' % sourceUrl)

    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'class="time">(.*?)<.*?href="(http.*?)".*?title="(.*?)".*?播放:(.*?)<.*?发布:(.*?)<'
    infos = tools.getInfo(html, regex)

    for info in infos:
        length = info[0]
        url = info[1]
        videoName = info[2]
        playCount = info[3]
        releaseTime = info[4]

        log.debug('url : %s\ntitle : %s\nrelease time : %s\nduration : %s\nplay count : %s' %
                  (url, videoName, releaseTime, length, playCount))

        basePaser.addUrl(url, websiteId, VIDEO_ABSTRACT)
        basePaser.addDocumentary(websiteId, videoName, '', url, '', playCount,
                                 length, releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
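
These parsers lean on a small tools helper module that is not shown in the listing. Judging from the call sites, tools.getInfo(html, regex) wraps re.findall: a pattern with several groups yields tuples (hence info[0]..info[4] above), and later examples pass a list of patterns that is tried in order. A minimal sketch under those assumptions, in re.S mode so the '.*?' spans can cross line breaks (the occasional third argument, as in Example #3, is not modeled here):

import re

def getInfo(html, regexs):
    # Accept one pattern or a list of patterns and return the matches of
    # the first pattern that hits; multiple groups come back as tuples.
    if isinstance(regexs, str):
        regexs = [regexs]
    for regex in regexs:
        result = re.findall(regex, html, re.S)
        if result:
            return result
    return []
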
Example #2
def parseItermInfo(sourceUrl, websiteId):
    log.debug('Parsing channel info %s' % sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    json = tools.getJson(html)
    jsonArray = json['data']
    # when there is no data (i.e. the last page was reached), jsonArray is []
    # add the next page's url
    if jsonArray:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, ITERM_JSON, Constance.ITERM)

    for info in jsonArray:
        title = info['name']
        url = info['playUrl']
        releaseTime = info['createdTime']
        itemsCount = str(info['itemsCount'])
        log.debug('video: %s  release time: %s  episodes: %s  url: %s' % (title, releaseTime, itemsCount, url))

        basePaser.addUrl(url, websiteId, ITERM_URL, Constance.ITERM)
        basePaser.addDocumentary(websiteId, title, '', url, itemsCount, '', '', releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
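
The next-page bookkeeping above (read the page= parameter, add one, re-queue the rewritten url) recurs in most of the paginated examples below. Factored into a standalone helper it might look like the sketch that follows; nextPageUrl is a hypothetical name, not part of the original code, and it assumes the query string actually carries the parameter:

import re

def nextPageUrl(sourceUrl, param='page'):
    # Return the url of the following page, or None if the parameter is absent.
    match = re.search(r'%s=(\d+)' % param, sourceUrl)
    if match is None:
        return None
    currentPage = match.group(1)
    return sourceUrl.replace('%s=%s' % (param, currentPage),
                             '%s=%d' % (param, int(currentPage) + 1))

Example #3 would call it as nextPageUrl(sourceUrl, 'pageNo').
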
Example #3
def parseEpisodeUrl(sourceUrl, websiteId):
    log.debug('Fetching episode urls %s' % sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = '"playUrl":"(.*?)"'
    urls = tools.getInfo(html, regex, True)

    for url in urls:
        log.debug("episode url: %s" % url)
        basePaser.addUrl(url, websiteId, EPISODE_DESCRIBE, Constance.EPISODE)

    basePaser.updateUrl(sourceUrl, Constance.DONE)

    # add the next page's url
    if urls:
        currentPageRegex = r'pageNo=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('pageNo=%s' % currentPage, 'pageNo=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, EPISODE_URL, Constance.EPISODE)
Example #4
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

        regex = '[\u4e00-\u9fa5]+'
        chineseWord = tools.getInfo(html, regex)
        if not chineseWord:
            basePaser.updateUrl(sourceUrl, Constance.DONE)
            return

        # collect all urls on the current page
        urls = tools.getUrls(html)

        # filter out external links and add the rest to the database
        fitUrl = tools.fitUrl(urls, ['news.cn', 'xinhuanet.com'])
        for url in fitUrl:
            # log.debug('url = ' + url)
            basePaser.addUrl(url, websiteId, depth + 1)


    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="videoArea">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="articleEdit">'
             ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             '''%(depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

        # mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
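
tools.fitUrl is what keeps the crawl on-site: only urls whose host belongs to the given domain(s) survive, so the parser above stays on news.cn and xinhuanet.com (later examples pass "163.com", "qq.com", and so on). A plausible sketch, assuming it accepts either a single domain string or a list:

from urllib.parse import urlparse

def fitUrl(urls, domains):
    # Keep only urls whose hostname equals one of the domains
    # or is a subdomain of it.
    if isinstance(domains, str):
        domains = [domains]
    fit = []
    for url in urls:
        host = urlparse(url).hostname or ''
        if any(host == d or host.endswith('.' + d) for d in domains):
            fit.append(url)
    return fit
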
Example #5
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

        regex = '[\u4e00-\u9fa5]+'
        chineseWord = tools.getInfo(html, regex)
        if not chineseWord:
            basePaser.updateUrl(sourceUrl, Constance.DONE)
            return

        # collect all urls on the current page
        urls = tools.getUrls(html)

        # filter out external links and add the rest to the database
        fitUrl = tools.fitUrl(urls, "163.com")
        for url in fitUrl:
            # log.debug('url = ' + url)
            basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.replaceStr(title, '&.*?;')
    # content
    regexs = [
        '<div id="endText".*?>(.*?)<div class="post_btmshare">',
        '<div class="post_text".*?>(.*?)<div class="post_btmshare">'
    ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

        # mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
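
The [\u4e00-\u9fa5]+ check used in this and the previous example is a cheap language gate: the range covers the common CJK unified ideographs, so a page with no match is treated as non-Chinese and marked DONE without extracting anything. A two-line illustration:

import re

print(bool(re.search('[\u4e00-\u9fa5]+', '<p>新闻正文</p>')))       # True: Chinese content
print(bool(re.search('[\u4e00-\u9fa5]+', '<p>English only</p>')))  # False: page is skipped
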
Example #6
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # check whether the page is Chinese
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect all urls on the current page
    urls = tools.getUrls(html)

    # filter out external links and add the rest to the database
    fitUrl = tools.fitUrl(urls, "feng.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = ['<div id="main_content".*?>(.*?)</div>',
              '<div class="yc_con_l">(.*?)<div class="txt_share_box"',
              '<div id="slideNoInsertDefault"></div>(.*?)</div>']

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''

    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             '''%(depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #7
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # check whether the page is Chinese
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect all urls on the current page
    urls = tools.getUrls(html)

    # filter out external links and add the rest to the database
    fitUrl = tools.fitUrl(urls, "cctv.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)


    # extract the article info from the current page
    # title
    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''

    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             '''%(depth, sourceUrl, title, content))

    if content and title:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #8
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # collect all urls on the current page
    urls = tools.getUrls(html)

    # filter out external links and add the rest to the database
    fitUrl = tools.fitUrl(urls, "sohu.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = [
        '<div class="content clear clearfix".*?>(.*?)</div>',
        '<div class="box_con".*?>(.*?)<div class="edit clearfix"',
        '<div class="show_text">(.*?)</div>',
        '<div class="text">.*?<hr class="nonehr">',
        '<div itemprop="articleBody">(.*?)<div style="display:none;">',
        '<article>(.*?)</article>'
    ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             ''' % (depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #9
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # print(html)
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]
    # strip the extra backslashes (escaped backslashes are parked in a placeholder first)
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')

    # log.debug(u'%s'%jsonStr)
    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']

    if jsonArray is not None:
        # add the next page's url
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage,
                                        'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, 0)

        # extract the info on the current page
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']

            log.debug(
                'url : %s\ntitle : %s\nrelease time : %s\nduration : %s\nplay count : %s\nsource : %s\nabstract : %s'
                % (url, videoName, releaseTime, length, playtimes, source,
                   abstract))
            basePaser.addDocumentary(websiteId, videoName, abstract, url, '',
                                     playtimes, length, releaseTime, source)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
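
The three replace() calls above strip one level of backslash escaping from the JSONP payload without destroying backslashes that were themselves escaped: doubled backslashes are parked in a ~~~ placeholder, the remaining single backslashes are dropped, and the placeholder is restored to a single backslash. A standalone demonstration on a made-up sample (the trick assumes the payload never contains a literal ~~~):

def unescapeJsonp(jsonStr):
    # Park escaped backslashes, drop the escaping layer, restore them.
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    return jsonStr.replace('~~~', '\\')

raw = '{\\"title\\":\\"C:\\\\\\\\tmp\\"}'  # as it sits inside the JSONP wrapper
print(unescapeJsonp(raw))                  # prints {"title":"C:\\tmp"} -- valid JSON
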
Example #10
def parseInfo(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # print(html)
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]
    # strip the extra backslashes (escaped backslashes are parked in a placeholder first)
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')

    # log.debug(u'%s'%jsonStr)
    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']

    if jsonArray is not None:
        # add the next page's url
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage,
                                        'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, 0)

        # extract the info on the current page
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']

            log.debug(
                'url : %s\ntitle : %s\nrelease time : %s\nduration : %s\nplay count : %s\nsource : %s\nabstract : %s'
                % (url, videoName, releaseTime, length, playtimes, source,
                   abstract))
            basePaser.addDocumentary(websiteId, videoName, abstract, url, '',
                                     playtimes, length, releaseTime, source)

    basePaser.updateUrl(sourceUrl, Constance.DONE)


# url = 'http://api.v1.cn/v1Enhanced/interfaceForJsonP?callback=jQuery18308286485691806487_1477619118750&obj=cms.getArticle&cid=1147&page=1&nums=24&_=1477619416282'
# parseInfo(url, websiteId)
Example #11
def parseRootUrl(sourceUrl, websiteId, depth):
    # html = tools.getHtml(sourceUrl)
    h = httplib2.Http()  # Http() lives in httplib2; the stdlib httplib has no such class
    resp, content = h.request(sourceUrl)
    html = content.decode('utf-8', 'ignore')

    regexs = 'data-trigger-class="list_item_hover">.+?href="(.+?)"'
    urls = tools.getInfo(html, regexs)

    for url in urls:
        log.debug("saving video url to DB: %s" % url)
        basePaser.addUrl(url, websiteId, depth + 1, '')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #12
def parseEpisodeDescribeUrl(sourceUrl, websiteId):
    log.debug('Fetching episode-summary url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'videoKw.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)
    for url in urls:
        log.debug("episode summary url: %s" % url)
        basePaser.addUrl(url, websiteId, EPISODE_INFO, Constance.EPISODE)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #13
def parseShowDescribeUrl(sourceUrl, websiteId):
    log.debug('Fetching show-summary url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regexs = 'class="desc-link".*?href="(.+?)"'
    urls = tools.getInfo(html, regexs)
    for url in urls:
        log.debug("show summary url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_INFO, 'show')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #14
def parseShowDescribeUrl(sourceUrl, websiteId):
    log.debug('Fetching show-summary url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'movieTitle.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)
    for url in urls:
        log.debug("show detail url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_INFO)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #15
def parseRootUrl(sourceUrl, websiteId, depth):
    log.debug('Parsing RootNode url = %s begin...' % sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    reg = '<ul.*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?</ul>'

    urlss = tools.getInfo(html, reg)

    for urls in urlss:
        for url in urls:
            log.debug("saving video url to DB: %s" % url)
            basePaser.addUrl(url, websiteId, depth + 1, '')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #16
def parseShowUrl(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regTypeId = basePaser.getRegexTypeId(Constance.VIDEO_URL)
    regexs = basePaser.getRegex(websiteId, regTypeId)
    urls = tools.getInfo(html, regexs)

    for url in urls:
        log.debug("show url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_DESCRIBE, 'show')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #17
def parseShowUrl(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'movielist_tt.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)

    for url in urls:
        log.debug("show url: %s" % url)
        if url.endswith('.shtml'):
            basePaser.addUrl(url, websiteId, SHOW_DESCRIBE)
        else:
            basePaser.addUrl(url, websiteId, SHOW_INFO)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #18
def parseVideoInfo(sourceUrl, websiteId):
    log.debug('Parsing video info %s' % sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    json = tools.getJson(html)
    jsonArray = json['data']

    # when there is no data (i.e. the last page was reached), jsonArray is []
    # add the next page's url
    if jsonArray:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, VIDEO_JSON, Constance.VIDEO)

    # parse the info on the current page
    for info in jsonArray:
        title = info['title']
        playTimes = str(info['playTimes'])
        pubDate = info['pubDate']
        totalTimeStr = info['totalTimeStr']
        urlCode = info['code']
        url = 'http://www.tudou.com/programs/view/%s/'%urlCode
        log.debug('video: %s  play count: %s  release time: %s  total duration: %s  url: %s' % (title, playTimes, pubDate, totalTimeStr, url))

        # # visit the url to fetch the abstract
        # videoHtml = tools.getHtml(url)
        # regex = 'class="v_desc">(.*?)</p>'
        # abstract = tools.getInfo(videoHtml, regex)
        # abstract = len(abstract) > 0 and abstract[0] or ''
        # # abstract = tools.replaceStr(abstract, '<.*?>')
        # log.debug('abstract: %s\n' % abstract)
        basePaser.addUrl(url, websiteId, VIDEO_URL, Constance.VIDEO)
        basePaser.addDocumentary(websiteId, title, '', url, '', playTimes, totalTimeStr, pubDate)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #19
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    # urlopen sometimes returned garbled pages; issuing a GET and setting the encoding explicitly fixed it
    html = tools.getHtmlByGet(sourceUrl)

    if not DEBUG:
        if html is None:
            if sourceUrl == Constance.SINA:
                basePaser.updateUrl(sourceUrl, Constance.TODO)
            else:
                basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

        regex = '[\u4e00-\u9fa5]+'
        chineseWord = tools.getInfo(html, regex)
        if not chineseWord:
            basePaser.updateUrl(sourceUrl, Constance.DONE)
            return

        # collect all urls on the current page
        urls = tools.getUrls(html)

        # filter out external links and add the rest to the database
        fitUrl = tools.fitUrl(urls, "sina.com.cn")
        for url in fitUrl:
            # log.debug('url = ' + url)
            basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)

    if title == '加载中...':
        # page still says "loading..."; re-queue sourceUrl to retry later
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # content
    regexs = [
        'id="artibody".*?>(.*?)<!-- 吸顶导航结束定位标记 -->',
        'id="artibody".*?>(.*?)<div id="left_hzh_ad">',
        '<!-- 正文内容 begin -->(.*?)<!-- 正文内容 end -->',
        'id="articleContent".*?>(.*?)<div class="spacer"></div>'
    ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             ''' % (depth, sourceUrl, title, content))
    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

        # mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
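
The comment at the top of this example is why tools.getHtmlByGet exists: urlopen sometimes produced mojibake, and issuing a plain GET and then setting the response encoding explicitly cured it. A minimal sketch of such a helper using the requests library; the real tools implementation is not shown, so the fallback to apparent_encoding is an assumption:

import requests

def getHtmlByGet(url, encoding=None):
    # Fetch via GET and decode with the caller's encoding, falling back to
    # the detected one; return None on failure, matching the checks above.
    try:
        resp = requests.get(url, timeout=10)
        resp.encoding = encoding or resp.apparent_encoding
        return resp.text
    except requests.RequestException:
        return None
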
Example #20
def parseUrl(urlInfo):
    log.debug('Processing %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    # urlopen sometimes returned garbled pages; issuing a GET and setting the encoding explicitly fixed it
    html = tools.getHtmlByGet(sourceUrl, '')

    if not DEBUG:
        if html is None:
            if sourceUrl == Constance.TENCENT:
                basePaser.updateUrl(sourceUrl, Constance.TODO)
            else:
                basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

        regex = '[\u4e00-\u9fa5]+'
        chineseWord = tools.getInfo(html, regex)
        if not chineseWord:
            basePaser.updateUrl(sourceUrl, Constance.DONE)
            return

        # collect all urls on the current page
        urls = tools.getUrls(html)

        # filter out external links and add the rest to the database
        fitUrl = tools.fitUrl(urls, "qq.com")
        fitUrl = tools.filterRule(fitUrl, lineList)  # lineList: presumably a module-level rule list
        for url in fitUrl:
            # log.debug('url = ' + url)
            basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = [
        'bossZone="content">(.+?)正文已结束.+?</span>',
        'id="articleContent">(.*?)<div class="hasc">'
    ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

        # mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)