Example #1
def parseUrl(urlInfo):
    log.debug('处理 %s' % urlInfo)

    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = '<span class="length">(.*?)</span>.*? href="(.*?)">(.*?)</a>.*?<p>(.*?)</p>'
    infos = tools.getInfo(html, regex)

    for info in infos:
        videoLength = info[0]
        videoUrl = info[1]
        videoName = info[2]
        videoReleaseTime = info[3]
        # the name may carry markup such as <span id='video_hl'>纪录片</span>; filter it out
        rubbishs = tools.getInfo(videoName, '<span.*?</span>')  # find the HTML tags in the name
        for rubbish in rubbishs:
            videoName = videoName.replace(rubbish, "")

        log.debug('\n片名 %s\n发布时间 %s\n时长 %s\nurl %s\n' %
                  (videoName, videoReleaseTime, videoLength, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, 1, '',
                                 videoLength, videoReleaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #2
def parseEpisodeUrl(sourceUrl, websiteId):
    log.debug('取剧集url %s'%sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = '"playUrl":"(.*?)"'
    urls = tools.getInfo(html, regex, True)

    for url in urls:
        log.debug("剧集url: %s"%url)
        basePaser.addUrl(url, websiteId, EPISODE_DESCRIBE, Constance.EPISODE)

    basePaser.updateUrl(sourceUrl, Constance.DONE)

    # add the URL of the next page
    if urls:
        currentPageRegex = r'pageNo=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('pageNo=%s'%currentPage, 'pageNo=%d'%nextPage)
        log.debug('nextPageUrl = %s'%nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, EPISODE_URL, Constance.EPISODE)
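The next-page construction above reappears almost verbatim in Examples #4, #8, #9, and #20, with `page=` instead of `pageNo=` as the query parameter. A minimal sketch of how that repeated logic could be factored into one helper; `buildNextPageUrl` is a hypothetical name, and the call sites are assumed to keep passing the result to `basePaser.addUrl` as before:

import re

def buildNextPageUrl(sourceUrl, param='page'):
    # find the current value of the page parameter, e.g. 'pageNo=3'
    match = re.search(r'%s=(\d+)' % param, sourceUrl)
    if match is None:
        return None  # no page parameter to increment
    currentPage = match.group(1)
    nextPage = int(currentPage) + 1
    return sourceUrl.replace('%s=%s' % (param, currentPage),
                             '%s=%d' % (param, nextPage))

# e.g. in parseEpisodeUrl above:
#     nextPageUrl = buildNextPageUrl(sourceUrl, param='pageNo')
#     if nextPageUrl:
#         basePaser.addUrl(nextPageUrl, websiteId, EPISODE_URL, Constance.EPISODE)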
Example #3
def parseVideoInfo(sourceUrl, websiteId):
    log.debug('取视频信息 %s' % sourceUrl)

    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'class="time">(.*?)<.*?href="(http.*?)".*?title="(.*?)".*?播放:(.*?)<.*?发布:(.*?)<'
    infos = tools.getInfo(html, regex)

    for info in infos:
        length = info[0]
        url = info[1]
        videoName = info[2]
        playCount = info[3]
        releaseTime = info[4]

        log.debug('url : %s\n片名 : %s\n发布时间 : %s\n时长 : %s\n播放次数 : %s' %
                  (url, videoName, releaseTime, length, playCount))

        basePaser.addUrl(url, websiteId, VIDEO_ABSTRACT)
        basePaser.addDocumentary(websiteId, videoName, '', url, '', playCount,
                                 length, releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #4
def parseItermInfo(sourceUrl, websiteId):
    print(websiteId)
    log.debug('解析栏目信息 %s'%sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    json = tools.getJson(html)
    jsonArray = json['data']
    # jsonArray is [] when there is no data (i.e. the last page was reached)
    # add the URL of the next page
    if jsonArray:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s'%currentPage, 'page=%d'%nextPage)
        log.debug('nextPageUrl = %s'%nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, ITERM_JSON, Constance.ITERM)

    for info in jsonArray:
        title = info['name']
        url = info['playUrl']
        releaseTime = info['createdTime']
        itemsCount = str(info['itemsCount'])
        log.debug('视频:%s 发布时间:%s 集数:%s url: %s'%(title, releaseTime, itemsCount, url))

        basePaser.addUrl(url, websiteId, ITERM_URL, Constance.ITERM)
        basePaser.addDocumentary(websiteId, title, '', url, itemsCount, '', '', releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #5
def parseUrl(urlInfo):
    log.debug('处理 %s' % urlInfo)

    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # match entries that carry an issue number
    regex = 'ui-list-ct.*?href=\'(.*?)\'.*?class="msk-txt">(.*?)<.*?class="main-tt">(.*?)</span>'
    infos = tools.getInfo(html, regex)
    for info in infos:
        print(info)
        videoUrl = info[0]
        videoReleaseTime = info[1]
        videoName = info[2]
        log.debug('\n片名 %s\n发布时间 %s\nurl %s\n' %
                  (videoName, videoReleaseTime, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, '', '',
                                 '', videoReleaseTime)

    print('-' * 40)
    regex = 'ui-list-ct.*?href=\'(.*?)\'.*?class="main-tt">(.*?)</span>'
    infos = tools.getInfo(html, regex)
    for info in infos:
        videoUrl = info[0]
        videoName = info[1]
        log.debug('\n片名 %s\nurl %s\n' % (videoName, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, '', '',
                                 '', '', '')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #6
def parseUrl(urlInfo):
    log.debug('处理 %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # collect every URL on the current page
    urls = tools.getUrls(html)

    # filter out external links and add the rest to the database
    fitUrl = tools.fitUrl(urls, "sohu.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = [
        '<div class="content clear clearfix".*?>(.*?)</div>',
        '<div class="box_con".*?>(.*?)<div class="edit clearfix"',
        '<div class="show_text">(.*?)</div>',
        '<div class="text">.*?<hr class="nonehr">',
        '<div itemprop="articleBody">(.*?)<div style="display:none;">',
        '<article>(.*?)</article>'
    ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             ''' % (depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
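A side note on the `title and title[0] or ''` pattern used just above and throughout the remaining examples: it is the pre-Python-2.5 emulation of a conditional expression, and it only behaves correctly here because an empty first element would collapse to `''` anyway. The equivalent modern form says the same thing more plainly:

# first-match-or-empty-string, written as a conditional expression
title = title[0] if title else ''
content = content[0] if content else ''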
Example #7
def parseShowInfo(sourceUrl, websiteId):
    log.debug('解析节目信息%s' % sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # show name
    regex = "<h1>(.*?)</h1>"
    showName = tools.getInfo(html, regex)
    showName = len(showName) > 0 and showName[0] or ''
    showName = tools.replaceStr(showName, '<.*?>')
    log.debug('片名:%s' % showName)

    # play count
    regex = "播放次数.*?>(.*?)<"
    playCount = tools.getInfo(html, regex)
    playCount = len(playCount) > 0 and playCount[0] or ''
    log.debug('播放次数: %s' % playCount)

    # release time
    regex = '<li>年份.*?>(.*?)<'
    releaseTime = tools.getInfo(html, regex)
    releaseTime = len(releaseTime) > 0 and releaseTime[0] or ''
    log.debug('发布时间: %s' % releaseTime)

    # episode count
    regex = r'更新至\s*?(.*?)<'
    episodeNum = tools.getInfo(html, regex)
    episodeNum = len(episodeNum) > 0 and episodeNum[0] or ''
    log.debug('集数: %s' % episodeNum)

    # running time
    regex = '片长.*?>(.*?)<'
    showLength = tools.getInfo(html, regex)
    showLength = len(showLength) > 0 and showLength[0] or ''
    log.debug('片长: %s' % showLength)

    # abstract
    # pages come with and without a details section
    regexs = [
        'intro_cont_all.*?<p>(.*?)<span', 'introduction.*?<p>(.*?)</div>'
    ]
    abstract = tools.getInfo(html, regexs)
    abstract = len(abstract) > 0 and abstract[0] or ''
    abstract = tools.replaceStr(abstract, '<.*?>')
    abstract = tools.replaceStr(abstract, '&ldquo;|&rdquo;')
    log.debug('简介: %s\n' % abstract)

    basePaser.addDocumentary(websiteId, showName, abstract, sourceUrl,
                             episodeNum, playCount, showLength, releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #8
def parseUrl(urlInfo):
    log.debug('处理 %s' % urlInfo)

    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # print(html)
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]
    # strip the redundant backslashes added by the JSONP escaping;
    # the '~~~' sentinel preserves backslashes that were themselves escaped
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')

    # log.debug(u'%s'%jsonStr)
    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']

    if jsonArray is not None:
        # add the URL of the next page
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage,
                                        'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, 0)

        # extract the info on the current page
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']

            log.debug(
                'url : %s\n片名 : %s\n发布时间 : %s\n时长 : %s\n播放次数 : %s\n来源 : %s\n简介 : %s'
                % (url, videoName, releaseTime, length, playtimes, source,
                   abstract))
            basePaser.addDocumentary(websiteId, videoName, abstract, url, '',
                                     playtimes, length, releaseTime, source)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
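Examples #8 and #9 peel a JSONP payload out of its `callback('...')` wrapper and then de-escape it with a three-step sentinel swap. A self-contained demonstration of the same trick on a made-up response (the callback name and payload are illustrative only):

import re
import json

# fake JSONP response; the single-quoted payload carries backslash-escaped
# quotes that json.loads cannot digest directly
html = "cb('{\\\"title\\\": \\\"documentary\\\"}')"

jsonStr = re.search(r"\('(.*?)'\)", html).group(1)
jsonStr = jsonStr.replace('\\\\', '~~~')  # stash doubled backslashes in a sentinel
jsonStr = jsonStr.replace('\\', '')       # drop the remaining escape backslashes
jsonStr = jsonStr.replace('~~~', '\\')    # restore the stashed backslashes
print(json.loads(jsonStr))                # {'title': 'documentary'}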
Example #9
def parseInfo(sourceUrl, websiteId=''):  # websiteId was used below but never defined; accept it as a parameter
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # print(html)
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]
    # strip the redundant backslashes added by the JSONP escaping;
    # the '~~~' sentinel preserves backslashes that were themselves escaped
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')

    # log.debug(u'%s'%jsonStr)
    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']

    if jsonArray is not None:
        # add the URL of the next page
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage,
                                        'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, 0)

        # extract the info on the current page
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']

            log.debug(
                'url : %s\n片名 : %s\n发布时间 : %s\n时长 : %s\n播放次数 : %s\n来源 : %s\n简介 : %s'
                % (url, videoName, releaseTime, length, playtimes, source,
                   abstract))
            basePaser.addDocumentary(websiteId, videoName, abstract, url, '',
                                     playtimes, length, releaseTime, source)

    basePaser.updateUrl(sourceUrl, Constance.DONE)


# url = 'http://api.v1.cn/v1Enhanced/interfaceForJsonP?callback=jQuery18308286485691806487_1477619118750&obj=cms.getArticle&cid=1147&page=1&nums=24&_=1477619416282'
# parseInfo(url)
Example #10
def parseShowDescribeUrl(sourceUrl, websiteId):
    log.debug('取节目简介 url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'movieTitle.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)
    for url in urls:
        log.debug("节目详情url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_INFO)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #11
def parseShowDescribeUrl(sourceUrl, websiteId):
    log.debug('取节目简介 url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regexs = 'class="desc-link".*?href="(.+?)"'
    urls = tools.getInfo(html, regexs)
    for url in urls:
        log.debug("节目简介url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_INFO, 'show')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #12
def parseEpisodeDescribeUrl(sourceUrl, websiteId):
    log.debug('取剧集简介 url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'videoKw.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)
    for url in urls:
        log.debug("剧集简介url: %s"%url)
        basePaser.addUrl(url, websiteId, EPISODE_INFO, Constance.EPISODE)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #13
def parseRootUrl(sourceUrl, websiteId, depth):
    #html = tools.getHtml(sourceUrl)
    h = httplib2.Http()  # assumes the httplib2 package; the stdlib httplib has no Http class
    resp, content = h.request(sourceUrl)
    html = content.decode('utf-8', 'ignore')

    regexs = 'data-trigger-class="list_item_hover">.+?href="(.+?)"'
    urls = tools.getInfo(html, regexs)

    for url in urls:
        log.debug("保存视频url到DB: %s" % url)
        basePaser.addUrl(url, websiteId, depth + 1, '')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #14
def parseRootUrl(sourceUrl, websiteId, depth):
    log.debug('解析 RootNode url = %s begin...'%sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:  # guard missing in the original; mirrors the other examples
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    reg = '<ul.*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?</ul>'

    urlss = tools.getInfo(html, reg)

    for urls in urlss:
        for url in urls:
            log.debug("保存视频url到DB: %s"%url)
            basePaser.addUrl(url, websiteId, depth + 1, '')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #15
def parseShowUrl(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regTypeId = basePaser.getRegexTypeId(Constance.VIDEO_URL)
    regexs = basePaser.getRegex(websiteId, regTypeId)
    urls = tools.getInfo(html, regexs)

    for url in urls:
        log.debug("节目url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_DESCRIBE, 'show')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #16
def parseVideoAbstract(sourceUrl, websiteId):
    # follow the url and grab the abstract
    videoHtml = tools.getHtml(sourceUrl)
    if videoHtml is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'class="v_desc">(.*?)</p>'
    abstract = tools.getInfo(videoHtml, regex)
    abstract = len(abstract) > 0 and abstract[0] or ''
    # abstract = tools.replaceStr(abstract, '<.*?>')
    log.debug('url: %s\n简介: %s\n'%(sourceUrl, abstract))

    basePaser.addDocumentary(websiteId, '', abstract, sourceUrl)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #17
def parseVideoAbstract(sourceUrl, websiteId):
    log.debug('取视频 %s' % sourceUrl)

    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'class="ckl_neir".*<p>(.*?)</p>'
    abstract = tools.getInfo(html, regex)
    abstract = abstract == [] and '' or abstract[0]
    abstract = abstract.replace('&quot;', '"')
    log.debug("url :%s\n简介:%s" % (sourceUrl, abstract))

    basePaser.addDocumentary(websiteId, '', abstract, sourceUrl)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #18
def parseShowUrl(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'movielist_tt.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)

    for url in urls:
        log.debug("节目url: %s" % url)
        if url.endswith('.shtml'):
            basePaser.addUrl(url, websiteId, SHOW_DESCRIBE)
        else:
            basePaser.addUrl(url, websiteId, SHOW_INFO)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #19
def parseShowInfo(sourceUrl, websiteId):
    log.debug('解析节目信息%s' % sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # title
    regexs = '<h1 class="title">.*?class="name">(.+?)</span>'
    showName = tools.getInfo(html, regexs)
    showName = len(showName) > 0 and showName[0] or ''
    log.debug('片名:%s' % showName)

    # episode count
    regexs = r'class="basenotice">.*?([\d-]+).*?</div>'
    episodeNum = tools.getInfo(html, regexs)
    episodeNum = len(episodeNum) > 0 and episodeNum[0] or ''
    log.debug('集数: %s' % episodeNum)

    # play count
    regexs = r"总播放:.*?>([\d,]+).*?</"
    playCount = tools.getInfo(html, regexs)
    playCount = len(playCount) > 0 and playCount[0] or ''
    log.debug('播放量: %s' % playCount)

    # abstract
    regexs = '<div class="detail">(.*?)</div>'
    abstract = tools.getInfo(html, regexs)
    abstract = len(abstract) > 0 and abstract[0] or ''
    rubbishs = tools.getInfo(abstract, '<.*?>')  # find the HTML tags in the abstract
    # strip the HTML tags from the abstract
    for rubbish in rubbishs:
        abstract = abstract.replace(rubbish, "")

    rubbishs = tools.getInfo(abstract, r'\s')  # find whitespace in the abstract (spaces, tabs, form feeds, etc.)
    # strip the whitespace from the abstract
    for rubbish in rubbishs:
        abstract = abstract.replace(rubbish, "")
    log.debug('简介: %s' % abstract)

    basePaser.addDocumentary(websiteId, showName, abstract, sourceUrl,
                             episodeNum, playCount)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #20
def parseVideoInfo(sourceUrl, websiteId):
    log.debug('解析视频信息 %s'%sourceUrl)

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    json = tools.getJson(html)
    jsonArray = json['data']

    # jsonArray is [] when there is no data (i.e. the last page was reached)
    # add the URL of the next page
    if jsonArray:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s'%currentPage, 'page=%d'%nextPage)
        log.debug('nextPageUrl = %s'%nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, VIDEO_JSON, Constance.VIDEO)

    # parse the info on the current page
    for info in jsonArray:
        title = info['title']
        playTimes = str(info['playTimes'])
        pubDate = info['pubDate']
        totalTimeStr = info['totalTimeStr']
        urlCode = info['code']
        url = 'http://www.tudou.com/programs/view/%s/'%urlCode
        log.debug('视频:%s 播放次数:%s 发布时间:%s 总时长:%s url: %s'%(title, playTimes, pubDate, totalTimeStr, url))

        # # follow the url and grab the abstract
        # videoHtml = tools.getHtml(url)
        # regex = 'class="v_desc">(.*?)</p>'
        # abstract = tools.getInfo(videoHtml, regex)
        # abstract = len(abstract) > 0 and abstract[0] or ''
        # # abstract = tools.replaceStr(abstract, '<.*?>')
        # log.debug('简介: %s\n'%abstract)
        basePaser.addUrl(url, websiteId, VIDEO_URL, Constance.VIDEO)
        basePaser.addDocumentary(websiteId, title, '', url, '', playTimes, totalTimeStr, pubDate)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #21
def parseItermAbstract(sourceUrl, websiteId):
    # follow the url and grab the abstract
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = '<span class="desc">(.*?)</span>'
    abstract = tools.getInfo(html, regex)
    abstract = len(abstract) > 0 and abstract[0] or ''
    # abstract = tools.replaceStr(abstract, '<.*?>')
    log.debug('url: %s\n简介: %s\n'%(sourceUrl, abstract))

    basePaser.addDocumentary(websiteId, '', abstract, sourceUrl)
    basePaser.updateUrl(sourceUrl, Constance.DONE)

# sourceUrl = 'http://www.tudou.com/list/playlistData.action?tagType=2&firstTagId=8&areaCode=&tags=&initials=&hotSingerId=&page=1&sort=2&key='
# parseItermInfo(sourceUrl, '')
Example #22
def parseEpisodeInfo(sourceUrl, websiteId):
    log.debug('解析剧集信息%s'%sourceUrl)

    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # title
    regex = 'class="cover_info">.*?title="(.*?)"'
    showName = tools.getInfo(html, regex)
    showName = len(showName) > 0 and showName[0] or ''
    log.debug('片名:%s'%showName)

    # release time
    regex = 'class="first".*?>(.*?)<'
    releaseTime = tools.getInfo(html, regex)
    releaseTime = len(releaseTime) > 0 and releaseTime[0] or ''
    log.debug('发布时间: %s'%releaseTime)

    # play count
    regex = 'class="key_item t_1".*?</span>(.*?)</span>'
    playCount = tools.getInfo(html, regex)
    playCount = len(playCount) > 0 and playCount[0] or ''
    log.debug('播放次数: %s'%playCount)

    # episode count
    regex = 'update:\'(.*?)\''
    episodeNum = tools.getInfo(html, regex)
    episodeNum = len(episodeNum) > 0 and episodeNum[0] or ''
    log.debug('集数: %s'%episodeNum)

    # abstract
    regex = 'class=\'desc\'>(.*?)</div>'
    abstract = tools.getInfo(html, regex)
    abstract = len(abstract) > 0 and abstract[0] or ''
    abstract = tools.replaceStr(abstract, '<.*?>')
    log.debug('简介: %s\n'%abstract)

    basePaser.addDocumentary(websiteId, showName, abstract, sourceUrl, episodeNum, playCount, '', releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #23
def parseVideoInfo(sourceUrl, websiteId):
    log.debug("解析视频 baserul = %s" % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # title and play count
    regexs = r'class="info-list">.*?href="(.+?)".*?title="\s*(.+?)\s*">.*?<li class=" ">\s*(.+?)\s*</li>'
    videosInfo = tools.getInfo(html, regexs)
    for videoInfo in videosInfo:
        videoUrl = videoInfo[0]
        videoName = videoInfo[1]
        videoPlayNum = videoInfo[2]
        log.debug("视频:%s\n播放量:%s\nurl: %s\n" %
                  (videoName, videoPlayNum, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, 1,
                                 videoPlayNum)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #24
def parseUrl(urlInfo):
    log.debug('处理 %s'%urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

        regex = '[\u4e00-\u9fa5]+'
        chineseWord = tools.getInfo(html, regex)
        if not chineseWord:
            basePaser.updateUrl(sourceUrl, Constance.DONE)
            return

        # collect every URL on the current page
        urls = tools.getUrls(html)

        # filter out external links and add the rest to the database
        fitUrl = tools.fitUrl(urls, ['news.cn', 'xinhuanet.com'])
        for url in fitUrl:
            # log.debug('url = ' + url)
            basePaser.addUrl(url, websiteId, depth + 1)


    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="videoArea">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="articleEdit">'
             ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             '''%(depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

        # mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #25
def parseUrl(urlInfo):
    log.debug('处理 %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

        regex = '[\u4e00-\u9fa5]+'
        chineseWord = tools.getInfo(html, regex)
        if not chineseWord:
            basePaser.updateUrl(sourceUrl, Constance.DONE)
            return

        # collect every URL on the current page
        urls = tools.getUrls(html)

        # filter out external links and add the rest to the database
        fitUrl = tools.fitUrl(urls, "163.com")
        for url in fitUrl:
            # log.debug('url = ' + url)
            basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.replaceStr(title, '&.*?;')
    # content
    regexs = [
        '<div id="endText".*?>(.*?)<div class="post_btmshare">',
        '<div class="post_text".*?>(.*?)<div class="post_btmshare">'
    ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

        # mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #26
def parseUrl(urlInfo):
    log.debug('处理 %s'%urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # check whether the page is Chinese or English
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect every URL on the current page
    urls = tools.getUrls(html)

    # filter out external links and add the rest to the database
    fitUrl = tools.fitUrl(urls, "feng.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = ['<div id="main_content".*?>(.*?)</div>',
              '<div class="yc_con_l">(.*?)<div class="txt_share_box"',
              '<div id="slideNoInsertDefault"></div>(.*?)</div>']

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''

    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             '''%(depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #27
def parseUrl(urlInfo):
    log.debug('处理 %s'%urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # check whether the page is Chinese or English
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect every URL on the current page
    urls = tools.getUrls(html)

    # filter out external links and add the rest to the database
    fitUrl = tools.fitUrl(urls, "cctv.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)


    # extract the article info from the current page
    # title
    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''

    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             '''%(depth, sourceUrl, title, content))

    if content and title:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #28
def parseList(sourceUrl, description):
    log.debug("parseList url = %s" % sourceUrl)

    columnId = description

    json = tools.getJsonByRequests(sourceUrl)
    # json = tools.getHtml(sourceUrl, 'utf-8')
    # json = tools.getJson(json)

    if not json:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    datas = dict(json)['data']
    for data in datas:
        data = tools.getJsonValue(data, 'content')

        title = tools.getJsonValue(data, 'title')

        # check whether it already exists in the database; skip it if so
        result = db.app_content_info.find({'title': title})
        if list(result):
            continue

        abstract = tools.getJsonValue(data, 'abstract')
        abstract = abstract and abstract or tools.getJsonValue(data, 'content')

        imgUrl = tools.getJsonValue(data, 'image_list.url')
        imgUrl = imgUrl and imgUrl or tools.getJsonValue(
            data, 'middle_image.url')
        imgUrl = imgUrl and imgUrl or tools.getJsonValue(
            data, 'large_image_list.url')
        imgUrl = imgUrl and imgUrl.replace('.webp', '.jpg') or imgUrl

        originalUrl = tools.getJsonValue(data, 'article_url')
        originalUrl = originalUrl and originalUrl or tools.getJsonValue(
            data, 'share_url')

        releaseTime = tools.getJsonValue(data, 'publish_time')
        releaseTime = releaseTime and releaseTime or tools.getJsonValue(
            data, '1481012423')
        releaseTime = releaseTime and tools.timestampToDate(
            releaseTime) or releaseTime

        videoMsg = tools.getJsonValue(data, 'video_play_info')  # needs further processing
        videoMainUrl = tools.getJsonValue(videoMsg,
                                          'video_list.video_2.main_url')
        videoMainUrl = videoMainUrl and videoMainUrl or tools.getJsonValue(
            videoMsg, 'video_list.video_1.main_url')
        parseVideoUrl = tools.compileJs(parseVideoUrlJSFunc)
        videoUrl = parseVideoUrl('base64decode', videoMainUrl)

        html = tools.getHtml(originalUrl)
        regexs = [
            'class="article-content">(.*?)<div class="article-actions">',
            '<div class="content">(.*?)<div class="suggestion-list-con"',
            '<!-- 文章内容 -->(.*?)<!-- @end 文章内容 -->',
            'class="yi-content-text">(.*?)<div class="yi-normal"',
            '<p.*?>(.*?)</p>'
        ]

        if videoUrl:
            content = abstract
        else:
            content = ''.join(tools.getInfo(html, regexs))
            content = tools.delHtmlTag(content)

        if len(content) < len(abstract):
            content = abstract

        # sensitive events
        sensitive_id = None
        sensitive_event_infos = db.sensitive_event_info.find({})
        for sensitive_event_info in sensitive_event_infos:
            keywords = sensitive_event_info['keyword1']
            keywords = keywords.split(',')
            for keyword in keywords:
                if keyword and (keyword in title or keyword in content):
                    sensitive_id = sensitive_event_info['_id']
                    break

            if sensitive_id:
                break

        # violation events
        violate_id = None
        vioation_knowledge_infos = db.vioation_knowledge_info.find({})
        for vioation_knowledge_info in vioation_knowledge_infos:
            keywords = vioation_knowledge_info['keyword1']
            keywords = keywords.split(' ')
            for keyword in keywords:
                if keyword and (keyword in title or keyword in content):
                    violate_id = vioation_knowledge_info['_id']
                    break

            if violate_id:
                break

        log.debug(
            '''
            title:        %s
            abstract :    %s
            imgUrl :      %s
            originalUrl:  %s
            releaseTime : %s
            videoMainUrl: %s
            videoUrl:     %s
            content :     %s
            columnId:     %d
            sensitive_id: %d
            violate_id:   %d

            ''' %
            (title, abstract, imgUrl, originalUrl, releaseTime, videoMainUrl,
             videoUrl, content, columnId, sensitive_id and sensitive_id
             or 0, violate_id and violate_id or 0))

        # if this is the video column and nothing sensitive or violating matched, skip the download
        if columnId == Constance.VIDEO:
            if not sensitive_id and not violate_id:
                continue

        # download
        basePath = Constance.FILE_LOCAL_PATH
        isDownload = 0

        def callFunc():
            nonlocal isDownload  # 'global' would target a module-level name and miss this local
            isDownload = 1

        # download the image
        imgName = ''
        if imgUrl:
            imgName = 'images/' + tools.getCurrentDate(
                dateFormat='%Y-%m-%d') + "/" + tools.getCurrentDate(
                    dateFormat='%Y%m%d%H%M%S') + '.jpg'
            isDownload = tools.downloadFile(imgUrl, basePath, imgName,
                                            callFunc)
            if not isDownload:
                imgName = ''

        # download the video
        videoName = ''
        if videoUrl:
            videoName = 'videos/' + tools.getCurrentDate(
                dateFormat='%Y-%m-%d') + "/" + tools.getCurrentDate(
                    dateFormat='%Y%m%d%H%M%S') + '.mp4'
            isDownload = tools.downloadFile(videoUrl, basePath, videoName,
                                            callFunc)
            if not isDownload:
                videoName = ''

        if originalUrl:
            basePaser.addContentInfo(title, abstract, imgUrl, imgName,
                                     originalUrl, releaseTime, videoUrl,
                                     videoName, content, columnId, isDownload,
                                     sensitive_id, violate_id)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
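The download bookkeeping above depends on a callback mutating a flag that lives in the enclosing function, which is what `nonlocal` is for; the original `global isDownload` silently targeted a module-level name and left the local flag untouched. A minimal sketch of the pattern, with a stand-in for `tools.downloadFile` (whose real signature isn't shown in these examples):

def fetchWithFlag(url):
    isDownload = 0

    def markDone():
        nonlocal isDownload  # rebinds the enclosing local; 'global' would not
        isDownload = 1

    def downloadFile(url, onSuccess):
        # stand-in: pretend the transfer succeeded and fire the callback
        onSuccess()
        return True

    downloadFile(url, markDone)
    return isDownload

print(fetchWithFlag('http://example.com/a.jpg'))  # prints 1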
Example #29
def parseUrl(urlInfo):
    log.debug('处理 %s' % urlInfo)

    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    # urlopen sometimes returned garbled pages; issuing a GET and setting the encoding explicitly fixed it
    html = tools.getHtmlByGet(sourceUrl, '')

    if not DEBUG:
        if html is None:
            if sourceUrl == Constance.TENCENT:
                basePaser.updateUrl(sourceUrl, Constance.TODO)
            else:
                basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

        regex = '[\u4e00-\u9fa5]+'
        chineseWord = tools.getInfo(html, regex)
        if not chineseWord:
            basePaser.updateUrl(sourceUrl, Constance.DONE)
            return

        # collect every URL on the current page
        urls = tools.getUrls(html)

        # filter out external links and add the rest to the database
        fitUrl = tools.fitUrl(urls, "qq.com")
        fitUrl = tools.filterRule(fitUrl, lineList)
        for url in fitUrl:
            # log.debug('url = ' + url)
            basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    # content
    regexs = [
        'bossZone="content">(.+?)正文已结束.+?</span>',
        'id="articleContent">(.*?)<div class="hasc">'
    ]

    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
                depth     = %d
                sourceUrl = %s
                title     = %s
                content   =  %s
             ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

        # mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
Example #30
def parseLeafUrl(sourceUrl, websiteId):
    log.debug('解析 LeafNode url = %s begin...' % sourceUrl)

    # create the driver outside the try block: if PhantomJS() itself failed,
    # the finally clause would otherwise hit an unbound 'driver'
    driver = webdriver.PhantomJS()
    try:
        driver.get(sourceUrl)
        time.sleep(2)
        html = driver.page_source
    finally:
        driver.quit()

    #html = tools.getHtml(sourceUrl)
    #h = httplib.Http(timeout=3)
    #resp,content=h.request(sourceUrl)
    #html = content.decode('utf-8','ignore')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        log.debug('未能正确获取此URL源码%s !!!' % sourceUrl)
        return

    log.debug('URL=%s正则匹配详细信息开始。。。' % sourceUrl)
    # album title
    videoName = ''
    regExs = [r'player_title">(.+?)[\s]*<']
    for reg in regExs:
        videoName = ''.join(tools.getInfo(html, reg))
        if videoName != '': break
    log.debug('专辑名称: %s' % videoName)

    # episode count
    videoNumber = ''
    regExs = [r'专辑总数据.+共([\d]+?)个']
    for reg in regExs:
        videoNumber = ''.join(tools.getInfo(html, reg))
        if videoNumber != '': break
    log.debug('集数: %s' % videoNumber)

    # abstract
    videoDescription = ''
    regExs = ['itemprop="description" content="(.*?)">?']
    for reg in regExs:
        videoDescription = ''.join(tools.getInfo(html, reg))
        if videoDescription != '': break
    log.debug('简介: %s' % videoDescription)

    # total play count
    videoPlayCount = ''
    regExs = ['mod_album_total.+?total_count">总播放量.+?>(.*?)</em>']
    for reg in regExs:
        videoPlayCount = ''.join(tools.getInfo(html, reg))
        if videoPlayCount != '': break
    log.debug('总播放量: %s' % videoPlayCount)

    # url
    log.debug('URL = %s' % sourceUrl)

    # total running time (in seconds)
    videoAllTime = ''
    regExs = ['<span class="figure_info">(.*?)</span>']
    for reg in regExs:
        videoAllTime = ''.join(tools.timeListToString(tools.getInfo(html,
                                                                    reg)))
        if videoAllTime != '': break
    log.debug('总片长 : %s' % videoAllTime)

    # release time
    videoReleaseTime = ''
    regExs = ['meta itemprop="datePublished" content="(.*?)"']
    for reg in regExs:
        videoReleaseTime = ''.join(tools.getInfo(html, reg))
        if videoReleaseTime != '': break
    log.debug('发布时间 : %s' % videoReleaseTime)

    # broadcaster
    videoPlayCompany = ''
    log.debug('播出机构暂无。。。')

    # info from Baidu Baike
    videoBaiduInfo = ''
    log.debug('百度百科上的信息暂无。。。')

    log.debug('URL=%s正则匹配详细信息结束。。。' % sourceUrl)

    basePaser.addDocumentary(websiteId, videoName, videoDescription, sourceUrl,
                             videoNumber, videoPlayCount, videoAllTime,
                             videoReleaseTime)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
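One more note on Example #30: PhantomJS is abandoned and modern Selenium no longer ships a driver for it. A hedged sketch of the same fetch-rendered-source step with headless Chrome, keeping the original's crude fixed sleep as the wait strategy:

import time
from selenium import webdriver

def getRenderedHtml(sourceUrl):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)  # created outside try, as above
    try:
        driver.get(sourceUrl)
        time.sleep(2)  # crude wait for the page's JS to render
        return driver.page_source
    finally:
        driver.quit()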