def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

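    # Fetch the search result page and pick out each result block by its id pattern (id_cse_content_item_mid_*)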
    html, requests = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(
        html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})

    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(titles[i].previous_sibling.previous_sibling,
                                'a',
                                find_all=False)
            url = url['href']

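            # Fetch the detail page; only results whose <title> does not contain 不存在 or 取消 (i.e. the share still exists) are processed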
            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            regexs = ['不存在', '取消']
            if not tools.get_info(mark, regexs):
                title = tools.get_text(
                    titles[i].previous_sibling.previous_sibling)
                title = tools.del_html_tag(title)
                info = tools.get_text(titles[i])

                file_name = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件名:(.+?)文')))

                file_size = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件大小:(.+?)分')))

                author = tools.del_html_tag(''.join(
                    tools.get_info(info, '分享者:(.+?)时')))

                release_time = ''.join(tools.get_info(info,
                                                      '时间:(.+?)下')).replace(
                                                          '\n', '')

                download_count = tools.del_html_tag(''.join(
                    tools.get_info(info, r'下载次数:(.+?)\.')))

                log.debug('''
                    标题:    %s
                    文件大小:%s
                    文件名字:%s
                    作者:    %s
                    原文url: %s
                    下载数量:%s
                    日期:    %s
                       ''' % (title, file_size, file_name, author, url,
                              download_count, release_time))

                contained_key, contained_key_count = base_parser.get_contained_key(
                    title, '', remark['search_keyword1'],
                    remark['search_keyword2'], remark['search_keyword3'])
                if not contained_key:
                    continue

                base_parser.add_content_info('VA_content_info',
                                             SITE_ID,
                                             url,
                                             title,
                                             file_size=file_size,
                                             file_name=file_name,
                                             author=author,
                                             release_time=release_time,
                                             download_count=download_count,
                                             search_type=search_type,
                                             keyword=contained_key,
                                             keyword_count=contained_key_count,
                                             task_id=remark['task_id'])
        except:
            continue
    base_parser.update_url('VA_urls', root_url, Constance.DONE)


def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

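    # Render the result page with a webdriver and collect each result header (<h3 class="t">)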
    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'h3', {'class': 't'})

    for i in range(0, len(headers)):
        title = tools.get_text(headers[i])
        title = tools.del_html_tag(title)
        if tools.re.compile('的相关视频在线观看_百度视频').findall(title):
            continue

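        # Each result links through a redirect url; resolve it to the real target with a HEAD request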
        try:
            ssurl = headers[i].a["href"]
        except:
            continue
        r = tools.requests.head(ssurl)
        url = r.headers.get('Location', ssurl)

        try:
            img = headers[i].next_sibling()[0].img['src']
        except:
            img = ''

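        # The publish date appears in one of the first few sibling nodes as yyyy年mm月dd日; normalize it to yyyy-mm-dd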
        release_time = ''
        try:
            date_regex = tools.re.compile(r'\d\d\d\d年\d+?月\d+?日')
            for sibling in headers[i].next_sibling()[:4]:
                release_time = ''.join(date_regex.findall(str(sibling)))
                if release_time:
                    break
            release_time = release_time.replace('年', '-').replace(
                '月', '-').replace('日', '')
        except:
            release_time = ''

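        # The abstract, if present, sits in a div.c-abstract among the sibling nodes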
        content = ''
        for sibling in headers[i].next_sibling():
            abstract = tools.get_tag(sibling,
                                     'div', {'class': 'c-abstract'},
                                     find_all=False)
            if abstract:
                content = tools.get_text(abstract)
                break

        log.debug('''
            标题:   %s
            内容:   %s
            原文url:%s
            图片url:%s
            日期:   %s
               ''' % (title, content, url, img, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

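        # Keep only results that appear to contain a video: judged first by the site,
        # then by the title/content, and finally by scanning the fetched page itself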
        is_video1 = base_parser.is_have_video_by_site(url)
        if not is_video1:
            is_video2 = base_parser.is_have_video_by_judge(title, content)
            if is_video2:
                html2, request = tools.get_html_by_requests(url)
                is_video3 = base_parser.is_have_video_by_common(html2)
                if not is_video3:
                    continue
            else:
                continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url=url,
                                     title=title,
                                     content=content,
                                     image_url=img,
                                     release_time=release_time,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)
    base_parser.update_url('VA_urls', root_url, Constance.DONE)


def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

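    # Fetch the list page; each <h3> holds a title and the <dt> at the same index holds its file info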
    html, requests = tools.get_html_by_requests(root_url)
    titles = tools.get_tag(html, 'h3')
    video_infos = tools.get_tag(html, 'dt')
    for i in range(0, len(titles)):
        title = tools.get_text(titles[i])
        title = tools.del_html_tag(title)
        try:
            url = titles[i].a['href']
        except:
            continue
        url = 'http://www.bturls.net' + url

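        # Release time, file size and view count come from the first <span> and its following sibling nodes inside the <dt>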
        release_time = video_infos[i].span
        release_time = tools.get_text(release_time)

        file_size = video_infos[i].span.next_sibling.next_sibling
        file_size = tools.get_text(file_size)

        watched_count = (video_infos[i].span.next_sibling.next_sibling
                         .next_sibling.next_sibling)
        watched_count = tools.get_text(watched_count)

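        # The info hash is the path segment between 't/' and '.' in the detail url, so the magnet link can be rebuilt from it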
        regexs = [r't/(.+?)\.']
        magnet_link = 'magnet:?xt=urn:btih:' + ''.join(
            tools.get_info(url, regexs))

        log.debug(
            '''
            标题:    %s
            文件大小:%s
            原文url: %s
            观看数量:%s
            磁力链接:%s
            日期:    %s
               ''' %
            (title, file_size, url, watched_count, magnet_link, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url,
                                     title,
                                     file_size=file_size,
                                     release_time=release_time,
                                     watched_count=watched_count,
                                     magnet_link=magnet_link,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])
    base_parser.update_url('VA_urls', root_url, Constance.DONE)


def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse the page
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)

    if not html:
        base_parser.update_url('VA_urls', root_url, Constance.EXCEPTION)
        return

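    # Each search result is an <li> inside the div.news-box container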
    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]

    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # Image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # Title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # Content
            content = tools.get_tag(news,
                                    name='p',
                                    attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # View count
            watched_count = ''

            # Source
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # Date
            release_time = tools.get_tag(news,
                                         name='div',
                                         attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Check whether the result contains a video, judged by the play-icon markup
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)

        except:
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])

        log.debug(
            '''
            标题:   %s
            内容:   %s
            来源:   %s
            原文url:%s
            图片url:%s
            观看数: %s
            日期:   %s
            有视频: %d
            关键词: %s
            关键词数:%s
                  ''' %
            (title, content, origin, url, image, watched_count, release_time,
             bool(play_icon), contained_key, contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url,
                                     title,
                                     content,
                                     image_url=image,
                                     release_time=release_time,
                                     origin=origin,
                                     watched_count=watched_count,
                                     search_type=SEARCH_TYPE,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)