Code example #1
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    # These patterns match Chinese field labels on the page, so the Chinese
    # literals are functional and must stay as-is.
    title = ['<tr height="25"><td><a href=".*?"  title="(.*?)"']
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?)&nbsp']
    file_size = ['资料大小: </span>(.*?)&nbsp']
    download_count = ['下载次数: </span>(.*?)&nbsp']

    titles = tools.get_info(html, title, allow_repeat=True)
    video_urls = tools.get_info(html, video_url, allow_repeat=True)
    authors = tools.get_info(html, author, allow_repeat=True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat=True)
    file_sizes = tools.get_info(html, file_size, allow_repeat=True)
    download_counts = tools.get_info(html, download_count, allow_repeat=True)

    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)

        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)

        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            Title:          %s
            Video url:      %s
            Author:         %s
            Watched count:  %s
            File size:      %s
            Download count: %s
        ''' % (title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title,
                                     file_size=file_size, file_name=title,
                                     author=author, watched_count=watched_count,
                                     download_count=download_count,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
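
These parsers all lean on an external tools module that is not shown. As a rough guide to what the extraction step above does, here is a minimal sketch of a get_info-style helper built on re.findall; the real signature and behavior in tools may differ.

import re

def get_info(html, regexs, allow_repeat=True):
    # Accept a single pattern or a list of patterns, as the call sites do.
    if isinstance(regexs, str):
        regexs = [regexs]
    results = []
    for regex in regexs:
        # re.S lets '.' span newlines, which scraped HTML usually needs.
        results.extend(re.findall(regex, html, re.S))
    if not allow_repeat:
        # Drop duplicates while preserving order.
        results = list(dict.fromkeys(results))
    return results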
Code example #2
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, request = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(
        html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})

    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(titles[i].previous_sibling.previous_sibling,
                                'a',
                                find_all=False)
            url = url['href']

            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            # Skip pages whose <title> marks a dead or withdrawn share
            # ('不存在' = does not exist, '取消' = cancelled).
            regexs = ['不存在', '取消']
            if not tools.get_info(mark, regexs):
                title = tools.get_text(
                    titles[i].previous_sibling.previous_sibling)
                title = tools.del_html_tag(title)
                info = tools.get_text(titles[i])

                # Each field regex stops at the first character of the next
                # Chinese label, e.g. '文' is the start of '文件大小'.
                file_name = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件名:(.+?)文')))

                file_size = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件大小:(.+?)分')))

                author = tools.del_html_tag(''.join(
                    tools.get_info(info, '分享者:(.+?)时')))

                release_time = ''.join(tools.get_info(info,
                                                      '时间:(.+?)下')).replace(
                                                          '\n', '')

                download_count = tools.del_html_tag(''.join(
                    tools.get_info(info, r'下载次数:(.+?)\.')))

                log.debug('''
                    Title:          %s
                    File size:      %s
                    File name:      %s
                    Author:         %s
                    Source url:     %s
                    Download count: %s
                    Date:           %s
                       ''' % (title, file_size, file_name, author, url,
                              download_count, release_time))

                contained_key, contained_key_count = base_parser.get_contained_key(
                    title, '', remark['search_keyword1'],
                    remark['search_keyword2'], remark['search_keyword3'])
                if not contained_key:
                    continue

                base_parser.add_content_info('VA_content_info',
                                             SITE_ID,
                                             url,
                                             title,
                                             file_size=file_size,
                                             file_name=file_name,
                                             author=author,
                                             release_time=release_time,
                                             download_count=download_count,
                                             search_type=search_type,
                                             keyword=contained_key,
                                             keyword_count=contained_key_count,
                                             task_id=remark['task_id'])
        except Exception:
            # Skip items whose page structure doesn't match expectations.
            continue
    base_parser.update_url('VA_urls', root_url, Constance.DONE)
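
Every example filters results through base_parser.get_contained_key, passing a title, a body, and up to three search keywords, and expecting back the matched keyword(s) plus an occurrence count. A plausible minimal implementation, assuming plain substring matching (the real base_parser may well differ):

def get_contained_key(title, content, keyword1='', keyword2='', keyword3=''):
    # Search both the title and the body text.
    text = (title or '') + (content or '')
    contained = []
    count = 0
    for keyword in (keyword1, keyword2, keyword3):
        if keyword and keyword in text:
            contained.append(keyword)
            count += text.count(keyword)
    # A falsy first element means "no keyword hit", which the callers rely on.
    return ','.join(contained), count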
Code example #3
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, request = tools.get_html_by_requests(root_url)
    titles = tools.get_tag(html, 'h3')
    video_infos = tools.get_tag(html, 'dt')
    for i in range(0, len(titles)):
        title = tools.get_text(titles[i])
        title = tools.del_html_tag(title)
        try:
            url = titles[i].a['href']
        except Exception:
            continue
        url = 'http://www.bturls.net' + url

        release_time = video_infos[i].span
        release_time = tools.get_text(release_time)

        file_size = video_infos[i].span.next_sibling.next_sibling
        file_size = tools.get_text(file_size)

        watched_count = (video_infos[i].span.next_sibling.next_sibling
                         .next_sibling.next_sibling)
        watched_count = tools.get_text(watched_count)

        # The BitTorrent info-hash is embedded in the detail-page url.
        # Anchor on '/t/' so the match can't start at the 't/' inside 'net/'.
        regexs = [r'/t/(.+?)\.']
        magnet_link = 'magnet:?xt=urn:btih:' + ''.join(
            tools.get_info(url, regexs))

        log.debug(
            '''
            Title:         %s
            File size:     %s
            Source url:    %s
            Watched count: %s
            Magnet link:   %s
            Date:          %s
               ''' %
            (title, file_size, url, watched_count, magnet_link, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url,
                                     title,
                                     file_size=file_size,
                                     release_time=release_time,
                                     watched_count=watched_count,
                                     magnet_link=magnet_link,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])
    base_parser.update_url('VA_urls', root_url, Constance.DONE)
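
The magnet link in example #3 is rebuilt from the BitTorrent info-hash embedded in the detail-page url. A standalone illustration with a made-up url (the pattern is anchored on '/t/' and the 40-hex-character hash, since a bare 't/' would also match inside 'net/'):

import re

url = 'http://www.bturls.net/t/0123456789abcdef0123456789abcdef01234567.html'
# The info-hash sits between '/t/' and the file extension.
info_hash = ''.join(re.findall(r'/t/([0-9a-fA-F]{40})\.', url))
print('magnet:?xt=urn:btih:' + info_hash)
# magnet:?xt=urn:btih:0123456789abcdef0123456789abcdef01234567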
Code example #4
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'h3', {'class': 't'})

    for i in range(0, len(headers)):
        title = tools.get_text(headers[i])
        title = tools.del_html_tag(title)
        # Skip Baidu's own aggregation entries
        # ('的相关视频在线观看_百度视频' = '... related videos online_Baidu Video').
        if tools.re.compile('的相关视频在线观看_百度视频').findall(title):
            continue

        try:
            ssurl = headers[i].a['href']
        except Exception:
            continue
        # requests.head does not follow redirects by default, so the redirect
        # target is available in the Location header of the 3xx response.
        r = tools.requests.head(ssurl)
        url = r.headers.get('Location', ssurl)

        try:
            img = headers[i].next_sibling()[0].img['src']
        except Exception:
            img = ''

        # The release date may sit in any of the first few sibling nodes;
        # scan them until the date pattern matches.
        release_time = ''
        try:
            date_regex = tools.re.compile(r'\d{4}年\d+?月\d+?日')
            for sibling in headers[i].next_sibling()[:4]:
                release_time = ''.join(date_regex.findall(str(sibling)))
                if release_time:
                    break
            release_time = release_time.replace('年', '-').replace(
                '月', '-').replace('日', '')
        except Exception:
            release_time = ''

        # Pull the abstract text out of the sibling nodes, if present.
        content = ''
        for sibling in headers[i].next_sibling():
            abstract = tools.get_tag(sibling, 'div', {'class': 'c-abstract'},
                                     find_all=False)
            if abstract:
                content = tools.get_text(abstract)
                break

        log.debug('''
            Title:      %s
            Content:    %s
            Source url: %s
            Image url:  %s
            Date:       %s
               ''' % (title, content, url, img, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        is_video1 = base_parser.is_have_video_by_site(url)
        if not is_video1:
            is_video2 = base_parser.is_have_video_by_judge(title, content)
            if is_video2:
                html2, request = tools.get_html_by_requests(url)
                is_video3 = base_parser.is_have_video_by_common(html2)
                if not is_video3:
                    continue
            else:
                continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url=url,
                                     title=title,
                                     content=content,
                                     image_url=img,
                                     release_time=release_time,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)
    base_parser.update_url('VA_urls', root_url, Constance.DONE)
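
Example #4 resolves Baidu's tracking link by issuing a HEAD request and reading the Location header; requests.head() does not follow redirects by default, so the header of the 3xx response is still available. A defensive standalone version of that step (the function name is ours):

import requests

def resolve_redirect(url):
    # HEAD does not follow redirects by default in requests, so a 3xx
    # response still carries its Location header.
    try:
        response = requests.head(url, timeout=10)
        return response.headers.get('Location', url)
    except requests.RequestException:
        # On network failure, fall back to the original url.
        return url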
Code example #5
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
        'Cookie': '_T_WM=6f2675cab22ec5e018673192b46dd73b; SUB=_2A251adqIDeRxGeNO7FER8ivPyziIHXVWlebArDV6PUJbkdANLXHjkW2Eu3heWA8h0QMZFxI0_fe7-s2Isw..; SUHB=0HThVFDsKbumHU; SCF=AvoSYQqv89TMIxx4YQUcoIdBp2-sjJbx28qHTTnKAHOymGxToTyDJijAZJl_Nqe3ve0x2U-Yk5poeuVn7bSqyt0.; M_WEIBOCN_PARAMS=featurecode%3D20000180%26oid%3D4060829337409043%26luicode%3D10000011%26lfid%3D106003type%253D1%26fid%3D100103type%253D1%2526q%253D%25E5%25A5%25B3%25E4%25B8%25BB%25E6%2592%25AD%26uicode%3D10000011',
        'Host': 'm.weibo.cn',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://m.weibo.cn/p/100103type%3D1%26q%3D%E5%A5%B3%E4%B8%BB%E6%92%AD?type=all&queryVal=%E5%A5%B3%E4%B8%BB%E6%92%AD&luicode=10000011&lfid=106003type%3D1&title=%E5%A5%B3%E4%B8%BB%E6%92%AD'
    }
    resp = tools.requests.get(root_url, headers=headers)
    infos = resp.json()
    try:
        cards = infos['cards']
    except Exception:
        # No result cards in the response; mark the url done and bail out.
        base_parser.update_url('VA_urls', root_url, Constance.DONE)
        return

    for card in cards:
        try:
            card_group = card['card_group']
            for group in card_group:
                mblog = group.get('mblog')
                if not mblog:
                    continue
                # Only keep posts that actually carry a video.
                video_url = get_video_url(mblog)
                if not video_url:
                    continue
                url = get_url(mblog)
                page_url = get_page_url(mblog)
                origin = get_origin(mblog)
                content = get_content(mblog)
                reposts_count = get_reposts_count(mblog)
                comment_count = get_comments_count(mblog)
                attitudes_count = get_attitudes_count(mblog)
                author = get_author(mblog)
                image_url = get_image_url(mblog)
                release_time = get_release_time(mblog)

                log.debug('''
                    Content:        %s
                    Source url:     %s
                    Author:         %s
                    Origin:         %s
                    Video cover:    %s
                    Video url:      %s
                    Date:           %s
                    Repost count:   %s
                    Comment count:  %s
                    Attitude count: %s
                         ''' % (content, url, author, origin, image_url,
                                video_url, release_time, str(reposts_count),
                                str(comment_count), str(attitudes_count)))

                contained_key, contained_key_count = base_parser.get_contained_key(
                    '', content, remark['search_keyword1'],
                    remark['search_keyword2'], remark['search_keyword3'])
                if not contained_key:
                    continue

                base_parser.add_content_info('VA_content_info',
                                             SITE_ID,
                                             url=video_url,
                                             release_time=release_time,
                                             origin=origin,
                                             title=content,
                                             reposts_count=reposts_count,
                                             comment_count=comment_count,
                                             attitudes_count=attitudes_count,
                                             author=author,
                                             image_url=image_url,
                                             video_url=video_url,
                                             search_type=search_type,
                                             keyword=contained_key,
                                             keyword_count=contained_key_count,
                                             task_id=remark['task_id'])
        except Exception:
            # Malformed card; skip it.
            continue
    base_parser.update_url('VA_urls', root_url, Constance.DONE)
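
The get_* helpers used above are defined elsewhere and simply read fields out of the mblog JSON returned by m.weibo.cn. A minimal sketch of two of them; the field names reflect the Weibo card layout as we understand it and are assumptions, not confirmed against the original module:

def get_reposts_count(mblog):
    # Repost count is a top-level field of the mblog dict (assumed name).
    return mblog.get('reposts_count', 0)

def get_video_url(mblog):
    # Video posts carry a page_info block (assumed layout); return '' when
    # absent so the caller's `if not get_video_url(mblog)` check works.
    page_info = mblog.get('page_info') or {}
    media_info = page_info.get('media_info') or {}
    return media_info.get('stream_url', '')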
Code example #6
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    regex = '<tr class="item">(.*?)</tr>'
    infos = tools.get_info(html, regex)
    for info in infos:
        title = ['<a href=".*?" onclick=.*?; class="">(.*?)</a>']
        title = tools.get_info(info, title, allow_repeat=True)
        title = ''.join(title)
        title = tools.del_html_tag(title)

        video_url = ['<a href="(.*?)" onclick=.*?; class="">']
        video_url = tools.get_info(info, video_url, allow_repeat=True)
        video_url = ''.join(video_url)

        # '人评价' = 'people rated'; the raw string keeps the regex escapes intact.
        comment_count = [r'<span class="pl">\((\d*?)人评价\)</span>']
        comment_count = tools.get_info(info, comment_count, allow_repeat=True)
        comment_count = ''.join(comment_count)
        comment_count = int(comment_count) if comment_count else 0

        release_time = r'<p class="pl">(\d{4}-\d{2}-\d{2}).*?</p>'
        release_time = tools.get_info(info, release_time, allow_repeat=True)
        release_time = ''.join(release_time)

        image_url = '<img src="(.*?)" alt=".*?" class=""/>'
        image_url = tools.get_info(info, image_url, allow_repeat=True)
        image_url = ''.join(image_url)

        content = '<p class="pl">(.*?)</p>'
        content = tools.get_info(info, content, allow_repeat=True)
        content = ''.join(content)

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])
        log.debug('''
        Title:         %s
        Content:       %s
        Comment count: %d
        Video url:     %s
        Image url:     %s
        Release time:  %s
        Keyword:       %s
        Keyword count: %d
        ''' % (title, content, comment_count, video_url, image_url,
               release_time, contained_key, contained_key_count))

        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url=video_url,
                                     title=title,
                                     content=content,
                                     image_url=image_url,
                                     comment_count=comment_count,
                                     release_time=release_time,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
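
Example #6 first cuts the page into one block per item with a single regex, then runs the field regexes inside each block, which keeps the fields of one item from pairing with another's. A toy demonstration of that block-then-field pattern:

import re

html = ('<tr class="item"><a href="/v/1" onclick="x()"; class="">First</a></tr>'
        '<tr class="item"><a href="/v/2" onclick="y()"; class="">Second</a></tr>')

# Split into per-item blocks first...
for block in re.findall(r'<tr class="item">(.*?)</tr>', html, re.S):
    # ...then extract fields within the block, so titles and urls stay paired.
    title = ''.join(re.findall(r'class="">(.*?)</a>', block))
    url = ''.join(re.findall(r'<a href="(.*?)"', block))
    print(url, title)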
Code example #7
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse the page
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)

    if not html:
        base_parser.update_url('VA_urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]

    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # Image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except Exception:
                url = ''

            # Title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # Content
            content = tools.get_tag(news,
                                    name='p',
                                    attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # Watched count (not exposed on the page)
            watched_count = ''

            # Origin / source
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # Date
            release_time = tools.get_tag(news,
                                         name='div',
                                         attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Check whether there is a video, based on the play-icon markup
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)

        except Exception:
            # Structure mismatch; skip this item.
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])

        log.debug(
            '''
            Title:         %s
            Content:       %s
            Origin:        %s
            Source url:    %s
            Image url:     %s
            Watched count: %s
            Date:          %s
            Has video:     %d
            Keyword:       %s
            Keyword count: %s
                  ''' %
            (title, content, origin, url, image, watched_count, release_time,
             bool(play_icon), contained_key, contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url,
                                     title,
                                     content,
                                     image_url=image,
                                     release_time=release_time,
                                     origin=origin,
                                     watched_count=watched_count,
                                     search_type=SEARCH_TYPE,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
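
In example #7 the date arrives as a Unix timestamp in the tag's 't' attribute and is converted with tools.timestamp_to_date. A minimal equivalent, assuming the tools version formats to a comparable string:

import time

def timestamp_to_date(timestamp, fmt='%Y-%m-%d %H:%M:%S'):
    # Convert a Unix timestamp in seconds to a local-time date string.
    return time.strftime(fmt, time.localtime(timestamp))

print(timestamp_to_date(1483228800))  # '2017-01-01 08:00:00' on a UTC+8 host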