コード例 #1
0
ファイル: article_tools.py プロジェクト: qDes/jaundice-rate
async def process_article(session,
                          morph,
                          charged_words,
                          url,
                          fetch_timeout=1.5,
                          split_timeout=3):
    """Download an article, split it into words and rate its 'jaundice' level.

    Returns a (status, url, score, words_count) tuple; score and words_count
    stay None unless processing finishes with status OK.
    """
    score, words_count = None, None
    try:
        # Network fetch and CPU-heavy splitting get separate time budgets.
        async with timeout(fetch_timeout):
            page_html = await fetch(session, url)
        plain_text = sanitize(page_html, plaintext=True)
        with count_time():
            async with timeout(split_timeout):
                words = await split_by_words(morph, plain_text)
        score = calculate_jaundice_rate(words, charged_words)
        words_count = len(words)
        status = ProcessingStatus.OK
    except ClientConnectionError:
        status = ProcessingStatus.FETCH_ERROR
    except ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT
    return status, url, score, words_count
コード例 #2
0
async def process_article(url, articles_data, processed_max_time=3):
    """Fetch, sanitize and score one article, appending a result dict
    to *articles_data*.

    A single *processed_max_time* budget (seconds) covers the whole
    fetch/split/score pipeline.
    """
    article_info = {
        'status': None,
        'url': url,
        'words_count': None,
        'score': None,
    }
    async with aiohttp.ClientSession() as session:
        # Removed unused `as timer_process` / `as cm` aliases — neither
        # context-manager result was referenced.
        with managed_time_processs():
            try:
                async with timeout(processed_max_time):
                    html = await fetch(session, url)
                    sanitized_html = sanitize(html)
                    # NOTE(review): `morph` is not a parameter here —
                    # presumably a module-level MorphAnalyzer; verify.
                    article_words = await text_tools.split_by_words(
                        morph, sanitized_html)
                    charged_words = fetch_charged_words(CHARGED_WORDS_FILE)
                    article_info['status'] = ProcessingStatus.OK.value
                    article_info['words_count'] = len(article_words)
                    article_info['score'] = text_tools.calculate_jaundice_rate(
                        article_words, charged_words)

            except adapters.ArticleNotFound:
                article_info['status'] = ProcessingStatus.PARSING_ERROR.value

            except asyncio.TimeoutError:
                article_info['status'] = ProcessingStatus.TIMEOUT.value

            except aiohttp.ClientResponseError:
                article_info['status'] = ProcessingStatus.FETCH_ERROR.value

        articles_data.append(article_info)
コード例 #3
0
async def process_article(session, morph, charged_words, url):
    """Fetch one article and rate it.

    Returns (url, status, score, words_count); score/words_count are None
    for any non-OK status.
    """
    words_count = score = None
    fetched = False
    try:
        html = await fetch(session, url)
        status = ProcessingStatus.OK
        plaintext = sanitize(html, plaintext=True)
        # Only reached when both fetch and sanitize succeeded.
        fetched = True
    except (SocksError, aiohttp.ClientError):
        status = ProcessingStatus.FETCH_ERROR
    except ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT

    if fetched:
        try:
            # Bound the CPU-heavy morphological split to 3 seconds.
            async with async_timeout.timeout(3):
                words = await split_by_words(morph, plaintext)
        except asyncio.TimeoutError:
            status = ProcessingStatus.TIMEOUT
        else:
            words_count = len(words)
            score = calculate_jaundice_rate(words, charged_words)

    return url, status, score, words_count
コード例 #4
0
async def process_article(article,
                          morph,
                          session,
                          charged_words,
                          fetch_timeout=3,
                          process_timeout=3):
    """Download *article*, score its 'jaundice' rate and wrap the outcome
    in a process_result.

    *fetch_timeout* bounds the HTTP request; *process_timeout* bounds the
    CPU-heavy word splitting.  Removed the unused `as e` aliases from all
    three except clauses (the bound exceptions were never referenced).
    """
    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, article)
        clean_text = inosmi_ru.sanitize(html, True)
        with work_timer():
            async with timeout(process_timeout):
                words = await text_tools.split_by_words(morph, clean_text)
            rate = text_tools.calculate_jaundice_rate(words, charged_words)
        status = ProcessingStatus.OK.value
        return process_result(status, article, rate, len(words))
    except inosmi_ru.ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR.value
        return process_result(status, article)
    except (aiohttp.ClientConnectorError, aiohttp.ClientResponseError):
        status = ProcessingStatus.FETCH_ERROR.value
        return process_result(status, article)
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT.value
        return process_result(status, article)
コード例 #5
0
ファイル: article.py プロジェクト: xdass/article_score
async def process_article(session,
                          morph,
                          charged_words,
                          url,
                          results,
                          max_timeout=ASYNC_TIMEOUT):
    """Rate an article's 'jaundice' (sensationalism) level.

    Appends a {'status', 'url', 'rating', 'words_count'} dict to *results*;
    rating and words_count stay None for any non-OK status.
    """
    async with measure_time():
        score = None
        words_count = None
        try:
            async with timeout(max_timeout):
                html = await fetch(session, url)
            # Assumed OK here; overwritten by the except clauses on failure.
            status = ProcessingStatus.OK
            article = inosmi_ru.sanitize(html)
            # Separate timeout budget for the CPU-heavy morphological split.
            async with timeout(max_timeout):
                words = await split_by_words(morph, article)
                score = calculate_jaundice_rate(words, charged_words)
            words_count = len(words)
        except (ClientError, InvalidURL):
            status = ProcessingStatus.FETCH_ERROR
        except exceptions.ArticleNotFound:
            status = ProcessingStatus.PARSING_ERROR
        except asyncio.exceptions.TimeoutError:
            status = ProcessingStatus.TIMEOUT

        results.append({
            'status': status.value,
            'url': url,
            'rating': score,
            'words_count': words_count
        })
コード例 #6
0
async def process_article(session: aiohttp.ClientSession,
                          morph: pymorphy2.MorphAnalyzer,
                          charged_words: list,
                          url: str,
                          results: list,
                          timeout: int = 10) -> None:
    """Fetch, sanitize and score one article, appending a result dict
    to *results*.

    BUGFIX: the return annotation claimed `-> dict`, but the function
    returns None — it reports via the *results* out-parameter only.
    A single *timeout* budget covers the whole fetch + split pipeline.
    """
    try:
        async with async_timeout(timeout):
            html = await fetch(session, url)
            article = sanitize(html)
            with measure_time():
                words = await split_by_words(morph, article)
            rating = calculate_jaundice_rate(words, charged_words)
            words_count = len(words)
            status = ProcessingStatus.OK
    except aiohttp.ClientError:
        rating = None
        words_count = None
        status = ProcessingStatus.FETCH_ERROR
    except ArticleNotFound:
        rating = None
        words_count = None
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        rating = None
        words_count = None
        status = ProcessingStatus.TIMEOUT_ERROR

    results.append({
        'rating': rating,
        'words_count': words_count,
        'status': status.value,
        'url': url
    })
コード例 #7
0
ファイル: process.py プロジェクト: wenny17/fake_news_analyzer
async def process_article(session: aiohttp.ClientSession,
                          morph: pymorphy2.MorphAnalyzer,
                          charged_words: List[str],
                          url: str,
                          timeout=PROCESSING_TIMEOUT) -> dict:
    """Fetch *url*, rate its text against *charged_words* and return a
    {'status', 'url', 'score', 'words_count'} summary dict."""
    score, words_count = None, None
    try:
        # Network fetch gets its own (response) timeout...
        async with async_timeout(RESPONSE_TIMEOUT):
            raw_html = await fetch(session, url)

        plain_text = sanitize_html(raw_html, url)

        # ...while splitting runs under the processing timeout.
        async with timing_manager(timeout):
            tokens = await split_by_words(morph, plain_text)

        score = calculate_jaundice_rate(tokens, charged_words)
        words_count = len(tokens)
        status = ProcessingStatus.OK

    except aiohttp.ClientError:
        status = ProcessingStatus.FETCH_ERROR

    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT

    except (ArticleNotFound, AdapterNotImplemented):
        status = ProcessingStatus.PARSING_ERROR

    return {
        'status': status.value,
        'url': url,
        'score': score,
        'words_count': words_count
    }
コード例 #8
0
async def process_article(
    session,
    morph,
    charged_words,
    url,
    processing_outputs,
    sanitizer_func=None
):
    """Fetch, optionally sanitize, split and score one article.

    Appends a (url, status, score, word_number, processing_time) tuple to
    *processing_outputs* and returns None.  On a handled failure the
    trailing fields stay None, except that a split timeout records
    processing_time = TIMEOUT_SECONDS.
    """
    score = None
    word_number = None
    processing_time = None
    try:
        async with timeout(TIMEOUT_SECONDS) as timeout_manager:
            html = await fetch(session, url)
    except (ClientConnectorError, InvalidURL, ClientResponseError):
        logger.warning(f'Can not connect to "{url}"')
        status = ProcessingStatus.FETCH_ERROR
        output = (url, status, score, word_number, processing_time)
        processing_outputs.append(output)
        return
    except asyncio.TimeoutError:
        # Only swallow timeouts raised by our own manager; re-raise
        # TimeoutErrors that bubbled up from elsewhere (outer cancellation).
        if not timeout_manager.expired:
            raise
        status = ProcessingStatus.TIMEOUT
        output = (url, status, score, word_number, processing_time)
        processing_outputs.append(output)
        return

    # Without a sanitizer the raw HTML is analyzed as-is.
    plain_text = html
    if sanitizer_func:
        try:
            plain_text = sanitizer_func(html, plaintext=True)
        except exceptions.ArticleNotFound:
            logger.warning(f'No article found on "{url}"')
            status = ProcessingStatus.PARSING_ERROR
            output = (url, status, score, word_number, processing_time)
            processing_outputs.append(output)
            return

    try:
        async with timeout(TIMEOUT_SECONDS) as timeout_manager:
            article_words = await split_by_words(morph, plain_text)
    except asyncio.TimeoutError:
        # Same guard as above: distinguish our timeout from a foreign one.
        if not timeout_manager.expired:
            raise
        logger.debug(f'Timeout exceeded while processing an article on {url}')
        status = ProcessingStatus.TIMEOUT
        # The split consumed its whole budget before being cancelled.
        processing_time = TIMEOUT_SECONDS
        output = (url, status, score, word_number, processing_time)
        processing_outputs.append(output)
        return

    status = ProcessingStatus.OK
    score = calculate_jaundice_rate(article_words, charged_words)
    word_number = len(article_words)
    # Elapsed time = budget minus what the timeout manager had left.
    processing_time = TIMEOUT_SECONDS - timeout_manager.remaining
    logger.debug(f'{url} has been processed in {processing_time} seconds')
    processing_output = (url, status, score, word_number, processing_time)
    processing_outputs.append(processing_output)
コード例 #9
0
async def process_article(session, morph, charged_words, url):
    """Check, fetch and score one article.

    Returns a {'title', 'status', 'score', 'words_count'} dict; fields stay
    None for stages that never ran.
    """
    result = {
        'title': None,
        'status': None,
        'score': None,
        'words_count': None
    }

    # Bail out early for URLs we have no parser for.
    try:
        await check_for_available_parse(url)
    except ArticleNotFound as exc:
        result.update({
            'title': f'{exc}',
            'status': ProcessingStatus.PARSING_ERROR.value
        })
        return result

    try:
        async with timeout(5):
            html = await fetch(url, session)
    except InvalidURL as e:
        result.update({
            'title': f'URL {e} Does not exist',
            'status': ProcessingStatus.FETCH_ERROR.value
        })
        return result
    # ClientConnectorError is a ClientError subclass, so listing both
    # (as the original did) was redundant.
    except ClientError:
        result.update({
            'title': 'Connection error',
            'status': ProcessingStatus.FETCH_ERROR.value
        })
        return result
    except asyncio.TimeoutError:
        result.update({
            'title': 'TimeOut error',
            'status': ProcessingStatus.TIMEOUT.value
        })
        return result

    sanitazed_text, title = sanitize(html, plaintext=True)
    # process_split_by_words yields (words, elapsed_seconds, error_flag).
    async with process_split_by_words(
            morph, sanitazed_text) as (splited_text, execution_time, error):
        if error:
            result.update({
                'title': title,
                'status': ProcessingStatus.TIMEOUT.value
            })
            return result
        score = calculate_jaundice_rate(splited_text, charged_words)
        logging.info(f'Анализ статьи произведен за {execution_time:.2f} сек.')
        result.update({
            'title': title,
            'status': ProcessingStatus.OK.value,
            'score': score,
            'words_count': len(splited_text)
        })
    return result
コード例 #10
0
ファイル: main.py プロジェクト: dimk00z/jaundice-rate
async def process_article(session: aiohttp.client.ClientSession,
                          morph: MorphAnalyzer,
                          charged_words: Tuple[str],
                          url: str,
                          sites_ratings: List[Dict],
                          skip_sanitizer: bool = False):
    """Fetch *url*, rate its text and append a result record to *sites_ratings*."""
    # Defaults reported for every stage that did not complete.
    yellow_rate = None
    words_count = None
    processing_time = None
    article_title = None
    try:
        # A single TIMEOUT budget covers fetch, sanitize and split.
        async with timeout(TIMEOUT):
            html: str = await fetch(session, url)

            article_title: str = extract_title(html)

            # Pick a site-specific sanitizer unless the caller opts out.
            domain_name = extract_sanitizer_name(url=url)
            article: str = html if skip_sanitizer else get_sanitizer(
                sanitizer_name=domain_name)(html, plaintext=True)

            with elapsed_timer() as timer:
                article_words: List[str] = await split_by_words(
                    morph=morph, text=article, splitting_timeout=TIMEOUT)
            processing_time = round(timer.duration, 3)

            yellow_rate: float = calculate_jaundice_rate(
                article_words=article_words, charged_words=charged_words)
            words_count = len(article_words)
            status = ProcessingStatus.OK

    except (ClientConnectorError, ClientError, ClientResponseError):
        article_title: str = 'URL not exist'
        status = ProcessingStatus.FETCH_ERROR

    except (ArticleNotFound, SanitizerNotFound):
        status = ProcessingStatus.PARSING_ERROR

    # NOTE(review): on Python < 3.11 asyncio.TimeoutError is NOT the builtin
    # TimeoutError, so this clause would miss the `timeout(TIMEOUT)` expiry —
    # confirm the target Python version or catch asyncio.TimeoutError.
    except TimeoutError:
        status = ProcessingStatus.TIMEOUT

    sites_ratings.append({
        'url': url,
        'title': article_title,
        'rate': yellow_rate,
        'words': words_count,
        # NOTE(review): the raw enum member is stored here while sibling
        # snippets store status.value — verify consumers expect the enum.
        'status': status,
        'processing_time': processing_time,
    })
コード例 #11
0
async def process_article(session, morph, charged_words, url, articles_rate):
    """Analyze one article and append its rating entry to *articles_rate*.

    Exception-to-status mapping is delegated to the handle_exceptions
    context manager; the whole pipeline shares one timeout.
    """
    # Overall deadline in seconds, overridable via the environment.
    deadline = int(os.getenv('MAX_WAITING_TIME', default=3))
    with run_timer():
        with handle_exceptions(articles_rate):
            async with timeout(deadline):
                raw_html = await fetch(session, url)
                sanitizer = get_sanitize_func(url)
                title, body_text = sanitizer(raw_html, True)
                tokens = await text_tools.split_by_words(morph, body_text)
                jaundice_rate = text_tools.calculate_jaundice_rate(
                    tokens, charged_words)
                articles_rate.append({
                    'title': title,
                    'status': str(ProcessingStatus.OK),
                    'rate': jaundice_rate,
                    'count_words': len(tokens)
                })
                logging.info(title)
コード例 #12
0
async def process_article(session, morph, charged_words, url, timeouts):
    """Process one article and return an OrderedDict with
    status/url/score/words_count.

    *timeouts* is a mapping with 'fetch' and 'split' second limits.
    Removed the unused `as e` alias on the ArticleNotFound clause.
    """
    status = score = words_count = None
    try:
        # The fetch timeout is applied inside fetch() itself; only the
        # CPU-bound split gets an explicit timeout context here.
        html = await fetch(session, url, timeouts['fetch'])
        text = SANITIZERS['inosmi_ru'](html, url)
        async with timeout(timeouts['split']):
            words = await split_by_words(morph, text)
        words_count = len(words)
        score = calculate_jaundice_rate(words, charged_words)
        status = ProcessingStatus.OK.name
    except (ClientConnectionError, ClientResponseError, InvalidURL):
        status = ProcessingStatus.CONN_ERROR.name
    except ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR.name
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT.name
    return OrderedDict(
        zip(['status', 'url', 'score', 'words_count'],
            [status, url, score, words_count]))
コード例 #13
0
async def process_article(
    session: aiohttp.ClientSession,
    morph: pymorphy2.MorphAnalyzer,
    charged_words: List[str],
    url: str,
    results: List[Dict[str, Union[str, int, float, None]]],
    request_timeout: Union[float, int] = 2,
    process_timeout: Union[float, int] = 3,
    cache: Optional[BaseCache] = None,
) -> None:
    """Score one article, consulting *cache* when given, and append the
    outcome dict to *results*."""
    # A cache hit short-circuits the whole pipeline.
    if cache:
        cached = await get_from_cache(cache, url)
        if cached:
            results.append(cached)
            return

    outcome = {
        'status': None,
        'url': url,
        'score': None,
        'word_count': None,
    }
    try:
        async with async_timeout.timeout(request_timeout):
            raw_html = await fetch(session, url)
        plain_text = SANITIZERS['inosmi_ru'](raw_html, plaintext=True)
        with measure_time():
            async with async_timeout.timeout(process_timeout):
                tokens = await split_by_words(morph, plain_text)
    except aiohttp.ClientError:
        outcome['status'] = ProcessingStatus.FETCH_ERROR.value
    except asyncio.TimeoutError:
        outcome['status'] = ProcessingStatus.TIMEOUT.value
    except ArticleNotFound:
        outcome['status'] = ProcessingStatus.PARSING_ERROR.value
    else:
        # Success path: fill in the score and cache the finished record.
        outcome['status'] = ProcessingStatus.OK.value
        outcome['score'] = calculate_jaundice_rate(tokens, charged_words)
        outcome['word_count'] = len(tokens)
        if cache:
            await set_to_cache(cache, url, outcome)
    results.append(outcome)
コード例 #14
0
async def process_article(session, morph, charged_words, url, analyze_results, fetch_timeout=TIMEOUT):
    """Fetch and score one article; append the analysis dict to
    *analyze_results* and return it."""
    title = 'URL not exist'
    jaundice_rating = words_amount = None
    status = ProcessingStatus.OK

    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, url)

        article_soup = BeautifulSoup(html, 'html.parser')
        title = article_soup.find('title').string

        article_text = sanitize(html, plaintext=True)
        with runtime_measurement():
            splited_text = await split_by_words(morph, article_text)
        words_amount = len(splited_text)

        jaundice_rating = calculate_jaundice_rate(splited_text, charged_words)

    except aiohttp.ClientError:
        status = ProcessingStatus.FETCH_ERROR

    except adapters.ArticleNotFound:
        # BUGFIX: the original pattern used 'http[s]', which requires the
        # literal 's' — plain http:// URLs never matched the scheme group
        # or the domain, so re.match returned None and .group() raised
        # AttributeError. 'https?' makes the 's' optional.
        domain_pattern = r'(^https?:\/\/)?(?P<domain>\w+\.\w+)'
        match = re.match(domain_pattern, url)
        title = f'Статья с сайта {match.group("domain")}'
        status = ProcessingStatus.PARSING_ERROR

    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT

    analyze_result = {
        'title': title,
        'status': status.value,
        'rating': jaundice_rating,
        'words_amount': words_amount
    }

    analyze_results.append(analyze_result)

    return analyze_result
コード例 #15
0
async def process_article(session,
                          article_url,
                          morph,
                          charged_words,
                          max_pending_time_of_fetching_article=3,
                          max_pending_time_of_splitting_by_words=3):
    """Download, sanitize and rate one article.

    Returns the value of get_article_processing_results(); on failure only
    status and url are filled in.
    """
    try:
        # Separate deadlines for the network fetch and the word split.
        async with timeout(max_pending_time_of_fetching_article):
            page = await fetch(session, article_url)

        plain_text = get_sanitized_article_text(article_url, page)

        with work_time_counter(
                f'Splitting by words for article {article_url}'):
            async with timeout(max_pending_time_of_splitting_by_words):
                tokens = await split_by_words(morph, plain_text)

    except aiohttp.ClientError:
        return get_article_processing_results(
            status=ProcessingStatus.FETCH_ERROR,
            url=article_url,
        )
    except (ArticleNotFound, SanitizerNotImplemented):
        return get_article_processing_results(
            status=ProcessingStatus.PARSING_ERROR,
            url=article_url,
        )
    except asyncio.TimeoutError:
        return get_article_processing_results(status=ProcessingStatus.TIMEOUT,
                                              url=article_url)

    rate = calculate_jaundice_rate(tokens, charged_words)

    return get_article_processing_results(
        status=ProcessingStatus.OK,
        url=article_url,
        words_count=len(tokens),
        score=rate,
    )
コード例 #16
0
def test_calculate_jaundice_rate():
    """Empty input rates ~0; one charged word out of three rates ~33%."""
    empty_rate = calculate_jaundice_rate([], [])
    assert -0.01 < empty_rate < 0.01
    mixed_rate = calculate_jaundice_rate(
        ["все", "аутсайдер", "побег"], ["аутсайдер", "банкротство"]
    )
    assert 33.0 < mixed_rate < 34.0
コード例 #17
0
async def score_text(morph: pymorphy2.MorphAnalyzer, text: str, negative: list):
    """Split *text* into words and return its jaundice rate against the
    *negative* (charged) word list."""
    words = await text_tools.split_by_words(morph=morph, text=text)
    rate = text_tools.calculate_jaundice_rate(words, negative)
    return rate
コード例 #18
0
def test_calculate_jaundice_rate():
    """calculate_jaundice_rate: empty input ≈ 0; one of three charged ≈ 33%."""
    assert abs(calculate_jaundice_rate([], [])) < 0.01
    rate = calculate_jaundice_rate(
        ['все', 'аутсайдер', 'побег'], ['аутсайдер', 'банкротство'])
    assert 33.0 < rate < 34.0