Example #1
def test_sanitize():
    resp = requests.get('https://inosmi.ru/economic/20190629/245384784.html')
    resp.raise_for_status()
    clean_text, title = sanitize(resp.text)

    assert clean_text.startswith('<article>')
    assert clean_text.endswith('</article>')
    assert "<title>\n USA Today (США): Трамп пообещал" in title
    assert 'В субботу, 29 июня, президент США Дональд Трамп' in clean_text
    assert 'За несколько часов до встречи с Си' in clean_text

    assert '<img src="' in clean_text
    assert '<a href="' in clean_text
    assert '<h1>' in clean_text

    clean_plaintext, title = sanitize(resp.text, plaintext=True)

    assert "<title>" not in title
    assert "USA Today (США): Трамп пообещал «пока» " in title
    assert 'В субботу, 29 июня, президент США Дональд Трамп' in clean_plaintext
    assert 'За несколько часов до встречи с Си' in clean_plaintext

    assert '<img src="' not in clean_plaintext
    assert '<a href="' not in clean_plaintext
    assert '<h1>' not in clean_plaintext
    assert '</article>' not in clean_plaintext
Example #2
async def process_article(session,
                          morph,
                          charged_words,
                          url,
                          fetch_timeout=1.5,
                          split_timeout=3):
    score = None
    words_count = None
    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, url)
        sanitized_html = sanitize(html, plaintext=True)
        with count_time():
            async with timeout(split_timeout):
                article_words = await split_by_words(morph, sanitized_html)
        score = calculate_jaundice_rate(article_words, charged_words)
        words_count = len(article_words)
        status = ProcessingStatus.OK
    except ClientConnectionError:
        status = ProcessingStatus.FETCH_ERROR
    except ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT
    return (status, url, score, words_count)
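Every example in this listing delegates the HTTP call to a fetch helper that is never shown. A minimal sketch, assuming the usual aiohttp pattern (the real helpers may differ):

async def fetch(session, url):
    # Raises aiohttp.ClientError subclasses on connection failures and,
    # via raise_for_status(), on non-2xx responses.
    async with session.get(url) as response:
        response.raise_for_status()
        return await response.text()

The async with timeout(...) blocks wrapping it presumably come from the async_timeout package, which cancels the enclosed code once the deadline passes and raises asyncio.TimeoutError.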
Example #3
async def process_article(session: aiohttp.ClientSession,
                          morph: pymorphy2.MorphAnalyzer,
                          charged_words: list,
                          url: str,
                          results: list,
                          timeout: int = 10) -> None:
    try:
        async with async_timeout(timeout):
            html = await fetch(session, url)
            article = sanitize(html)
            with measure_time():
                words = await split_by_words(morph, article)
            rating = calculate_jaundice_rate(words, charged_words)
            words_count = len(words)
            status = ProcessingStatus.OK
    except aiohttp.ClientError:
        rating = None
        words_count = None
        status = ProcessingStatus.FETCH_ERROR
    except ArticleNotFound:
        rating = None
        words_count = None
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        rating = None
        words_count = None
        status = ProcessingStatus.TIMEOUT_ERROR

    results.append({
        'rating': rating,
        'words_count': words_count,
        'status': status.value,
        'url': url
    })
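Both variants so far report a ProcessingStatus member via .value, but its definition is not part of the listing (note the naming drift: TIMEOUT in Example #2 versus TIMEOUT_ERROR here). A plausible sketch, with string values that are assumptions:

from enum import Enum


class ProcessingStatus(Enum):
    OK = 'OK'
    FETCH_ERROR = 'FETCH_ERROR'
    PARSING_ERROR = 'PARSING_ERROR'
    TIMEOUT = 'TIMEOUT'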
Example #4
async def process_article(url, articles_data, processed_max_time=3):
    article_info = {
        'status': None,
        'url': url,
        'words_count': None,
        'score': None,
    }
    async with aiohttp.ClientSession() as session:
        with managed_time_processs():
            try:
                async with timeout(processed_max_time):
                    html = await fetch(session, url)
                    sanitized_html = sanitize(html)
                    article_words = await text_tools.split_by_words(
                        morph, sanitized_html)
                    charged_words = fetch_charged_words(CHARGED_WORDS_FILE)
                    article_info['status'] = ProcessingStatus.OK.value
                    article_info['words_count'] = len(article_words)
                    article_info['score'] = text_tools.calculate_jaundice_rate(
                        article_words, charged_words)

            except adapters.ArticleNotFound:
                article_info['status'] = ProcessingStatus.PARSING_ERROR.value

            except asyncio.TimeoutError:
                article_info['status'] = ProcessingStatus.TIMEOUT.value

            except aiohttp.ClientResponseError:
                article_info['status'] = ProcessingStatus.FETCH_ERROR.value

        articles_data.append(article_info)
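The timing context managers scattered across these examples (count_time, measure_time, work_timer, managed_time_processs, runtime_measurement) are also left out. A minimal synchronous sketch, assuming they do nothing beyond logging elapsed wall-clock time:

import logging
import time
from contextlib import contextmanager


@contextmanager
def measure_time():
    # time.monotonic() is immune to system clock adjustments.
    start = time.monotonic()
    try:
        yield
    finally:
        logging.info('Block took %.2f s', time.monotonic() - start)

The async with measure_time() form used in Examples #5 and #7 would need contextlib.asynccontextmanager instead.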
Example #5
async def process_article(
        session: aiohttp.ClientSession,
        url: str,
        charged_words: list,
        morph: pymorphy2.MorphAnalyzer,
        results: list,
        max_timeout: int = constants.ASYNC_TIMEOUT,
):
    about = {
        "URL": url,
        "Рейтинг": None,
        "Слов в статье": None,
        "Статус": None,
    }
    try:
        async with timeout(max_timeout):
            html = await fetch(session=session, url=url)
            text = sanitize(html=html, plaintext=True)
        async with measure_execution_time():
            async with timeout(max_timeout):
                score = await score_text(morph=morph, text=text, negative=charged_words)

    except aiohttp.ClientError:
        about['Статус'] = statuses.ProcessingStatus.FETCH_ERROR.value
    except adapters.ArticleNotFound:
        about['Статус'] = statuses.ProcessingStatus.PARSING_ERROR.value
    except asyncio.TimeoutError:
        about['Статус'] = statuses.ProcessingStatus.TIMEOUT.value
    else:
        about['Статус'] = statuses.ProcessingStatus.OK.value
        about['Рейтинг'] = score
        about['Слов в статье'] = len(text)  # NOTE: len() of the sanitized text counts characters, not words

    results.append(about)
Example #6
async def process_article(article,
                          morph,
                          session,
                          charged_words,
                          fetch_timeout=3,
                          process_timeout=3):
    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, article)
    clean_text = inosmi_ru.sanitize(html, plaintext=True)
        with work_timer():
            async with timeout(process_timeout):
                words = await text_tools.split_by_words(morph, clean_text)
            rate = text_tools.calculate_jaundice_rate(words, charged_words)
        status = ProcessingStatus.OK.value
        return process_result(status, article, rate, len(words))
    except inosmi_ru.ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR.value
        return process_result(status, article)
    except (aiohttp.ClientConnectorError, aiohttp.ClientResponseError):
        status = ProcessingStatus.FETCH_ERROR.value
        return process_result(status, article)
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT.value
        return process_result(status, article)
Example #7
async def process_article(session,
                          morph,
                          charged_words,
                          url,
                          results,
                          max_timeout=ASYNC_TIMEOUT):
    """Анализ статьи на 'желтушность."""
    async with measure_time():
        score = None
        words_count = None
        try:
            async with timeout(max_timeout):
                html = await fetch(session, url)
            status = ProcessingStatus.OK
            article = inosmi_ru.sanitize(html)
            async with timeout(max_timeout):
                words = await split_by_words(morph, article)
                score = calculate_jaundice_rate(words, charged_words)
            words_count = len(words)
        except (ClientError, InvalidURL):
            status = ProcessingStatus.FETCH_ERROR
        except exceptions.ArticleNotFound:
            status = ProcessingStatus.PARSING_ERROR
        except asyncio.exceptions.TimeoutError:
            status = ProcessingStatus.TIMEOUT

        results.append({
            'status': status.value,
            'url': url,
            'rating': score,
            'words_count': words_count
        })
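Variants like this one write into a shared results list instead of returning a value, which suggests they are launched concurrently. A hedged driver sketch (the URL and the charged-words list are illustrative placeholders, not from the source):

import asyncio

import aiohttp
import pymorphy2


async def main():
    urls = ['https://inosmi.ru/economic/20190629/245384784.html']
    morph = pymorphy2.MorphAnalyzer()
    charged_words = ['скандал', 'разоблачение']  # placeholder vocabulary
    results = []
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(
            process_article(session, morph, charged_words, url, results)
            for url in urls
        ))
    print(results)


asyncio.run(main())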
Example #8
async def process_article(session, morph, charged_words, url):
    result = {
        'title': None,
        'status': None,
        'score': None,
        'words_count': None
    }

    try:
        await check_for_available_parse(url)
    except ArticleNotFound as exc:
        result.update({
            'title': f'{exc}',
            'status': ProcessingStatus.PARSING_ERROR.value
        })
        return result

    try:
        async with timeout(5):
            html = await fetch(url, session)
    except InvalidURL as e:
        result.update({
            'title': f'URL {e} does not exist',
            'status': ProcessingStatus.FETCH_ERROR.value
        })
        return result
    except ClientError:
        # ClientConnectorError is a subclass of ClientError, so one clause suffices.
        result.update({
            'title': 'Connection error',
            'status': ProcessingStatus.FETCH_ERROR.value
        })
        return result
    except asyncio.TimeoutError:
        result.update({
            'title': 'Timeout error',
            'status': ProcessingStatus.TIMEOUT.value
        })
        return result

    sanitized_text, title = sanitize(html, plaintext=True)
    async with process_split_by_words(
            morph, sanitized_text) as (split_text, execution_time, error):
        if error:
            result.update({
                'title': title,
                'status': ProcessingStatus.TIMEOUT.value
            })
            return result
        score = calculate_jaundice_rate(split_text, charged_words)
        logging.info(f'Article analyzed in {execution_time:.2f} s.')
        result.update({
            'title': title,
            'status': ProcessingStatus.OK.value,
            'score': score,
            'words_count': len(split_text)
        })
    return result
Example #9
def test_sanitize(inosmi_good_html):
    clean_text = sanitize(inosmi_good_html)
    assert clean_text.startswith('<article>')
    assert clean_text.endswith('</article>')
    assert 'В субботу, 29 июня, президент США Дональд Трамп' in clean_text
    assert 'За несколько часов до встречи с Си' in clean_text

    assert '<img src="' in clean_text
    assert '<a href="' in clean_text
    assert '<h1>' in clean_text
Example #10
def test_sanitize_plain_text(inosmi_good_html):
    clean_plaintext = sanitize(inosmi_good_html, plaintext=True)
    assert 'В субботу, 29 июня, президент США Дональд Трамп' in clean_plaintext
    assert 'За несколько часов до встречи с Си' in clean_plaintext

    assert '<img src="' not in clean_plaintext
    assert '<a href="' not in clean_plaintext
    assert '<h1>' not in clean_plaintext
    assert '</article>' not in clean_plaintext
Example #11
async def process_article(session, morph, charged_words, url, analyze_results, fetch_timeout=TIMEOUT):
    title = 'URL does not exist'
    jaundice_rating = words_amount = None
    status = ProcessingStatus.OK

    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, url)

        article_soup = BeautifulSoup(html, 'html.parser')
        title = article_soup.find('title').string

        article_text = sanitize(html, plaintext=True)
        with runtime_measurement():
            split_text = await split_by_words(morph, article_text)
        words_amount = len(split_text)

        jaundice_rating = calculate_jaundice_rate(split_text, charged_words)

    except aiohttp.ClientError:
        status = ProcessingStatus.FETCH_ERROR

    except adapters.ArticleNotFound:
        # https?:// makes the scheme truly optional; the original http[s]
        # pattern failed to match plain http:// URLs at all.
        domain_pattern = r'(https?://)?(?P<domain>\w+\.\w+)'
        match = re.match(domain_pattern, url)
        title = f'Article from {match.group("domain")}'
        status = ProcessingStatus.PARSING_ERROR

    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT

    analyze_result = {
        'title': title,
        'status': status.value,
        'rating': jaundice_rating,
        'words_amount': words_amount
    }

    analyze_results.append(analyze_result)

    return analyze_result
Example #12
def test_sanitize_wrong_url(inosmi_bad_html):
    with pytest.raises(ArticleNotFound):
        sanitize(inosmi_bad_html)
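The fixtures consumed by Examples #9, #10 and #12 (inosmi_good_html, inosmi_bad_html) are not shown either; unlike Example #1 they keep the tests off the network. A plausible conftest.py sketch, assuming pages saved next to the tests (the file names are assumptions):

import pathlib

import pytest


@pytest.fixture
def inosmi_good_html():
    # A saved copy of a real inosmi.ru article page.
    return pathlib.Path('fixtures/inosmi_article.html').read_text()


@pytest.fixture
def inosmi_bad_html():
    # A page that lacks the article markup sanitize() expects.
    return pathlib.Path('fixtures/not_an_article.html').read_text()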
Example #13
def test_sanitize_wrong_url():
    resp = requests.get('http://example.com')
    resp.raise_for_status()
    with pytest.raises(ArticleNotFound):
        sanitize(resp.text)