async def process_article(session, morph, charged_words, url,
                          fetch_timeout=1.5, split_timeout=3):
    """Download one article, rate its "jaundice" level and count its words.

    Returns a ``(status, url, score, words_count)`` tuple; ``score`` and
    ``words_count`` stay ``None`` unless processing succeeded.
    """
    score = words_count = None
    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, url)
        plaintext = sanitize(html, plaintext=True)
        with count_time():
            async with timeout(split_timeout):
                tokens = await split_by_words(morph, plaintext)
        score = calculate_jaundice_rate(tokens, charged_words)
        words_count = len(tokens)
        status = ProcessingStatus.OK
    except ClientConnectionError:
        status = ProcessingStatus.FETCH_ERROR
    except ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT
    return status, url, score, words_count
async def process_article(url, articles_data, processed_max_time=3):
    """Fetch and analyse one article, appending the result to *articles_data*.

    The appended dict always carries ``status`` and ``url``; ``words_count``
    and ``score`` stay ``None`` unless processing finished successfully.
    *processed_max_time* caps the whole fetch-and-analyse step in seconds.
    """
    article_info = {
        'status': None,
        'url': url,
        'words_count': None,
        'score': None,
    }
    async with aiohttp.ClientSession() as session:
        # Unused `as timer_process` / `as cm` bindings removed.
        with managed_time_processs():
            try:
                async with timeout(processed_max_time):
                    html = await fetch(session, url)
                    sanitized_html = sanitize(html)
                    article_words = await text_tools.split_by_words(
                        morph, sanitized_html)
                    charged_words = fetch_charged_words(CHARGED_WORDS_FILE)
                    article_info['status'] = ProcessingStatus.OK.value
                    article_info['words_count'] = len(article_words)
                    article_info['score'] = text_tools.calculate_jaundice_rate(
                        article_words, charged_words)
            except adapters.ArticleNotFound:
                article_info['status'] = ProcessingStatus.PARSING_ERROR.value
            except asyncio.TimeoutError:
                article_info['status'] = ProcessingStatus.TIMEOUT.value
            except aiohttp.ClientError:
                # Broadened from ClientResponseError: connection-level
                # failures (DNS errors, refused connections) must also be
                # reported as FETCH_ERROR instead of crashing the task.
                article_info['status'] = ProcessingStatus.FETCH_ERROR.value
    articles_data.append(article_info)
async def process_article(session, morph, charged_words, url):
    """Fetch *url*, sanitize the page and score its jaundice level.

    Returns ``(url, status, score, words_count)``; the last two are ``None``
    whenever the article could not be fully processed.
    """
    # Guard-clause style: any fetch/sanitize failure exits immediately,
    # replacing the original `link_fetched` flag.
    try:
        html = await fetch(session, url)
        status = ProcessingStatus.OK
        clean_plaintext = sanitize(html, plaintext=True)
    except (SocksError, aiohttp.ClientError):
        return url, ProcessingStatus.FETCH_ERROR, None, None
    except ArticleNotFound:
        return url, ProcessingStatus.PARSING_ERROR, None, None
    except asyncio.TimeoutError:
        return url, ProcessingStatus.TIMEOUT, None, None

    try:
        async with async_timeout.timeout(3):
            article_words = await split_by_words(morph, clean_plaintext)
        words_count = len(article_words)
        score = calculate_jaundice_rate(article_words, charged_words)
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT
        score = words_count = None
    return url, status, score, words_count
async def process_article(article, morph, session, charged_words,
                          fetch_timeout=3, process_timeout=3):
    """Fetch and analyse *article*, returning a ``process_result`` record.

    ``fetch_timeout`` caps the download, ``process_timeout`` caps the
    morphological split; each failure maps to a matching ProcessingStatus.
    """
    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, article)
        clean_text = inosmi_ru.sanitize(html, True)
        with work_timer():
            async with timeout(process_timeout):
                words = await text_tools.split_by_words(morph, clean_text)
        rate = text_tools.calculate_jaundice_rate(words, charged_words)
        status = ProcessingStatus.OK.value
        return process_result(status, article, rate, len(words))
    except inosmi_ru.ArticleNotFound:
        # Unused `as e` bindings removed: the exception was never read.
        return process_result(ProcessingStatus.PARSING_ERROR.value, article)
    except (aiohttp.ClientConnectorError, aiohttp.ClientResponseError):
        return process_result(ProcessingStatus.FETCH_ERROR.value, article)
    except asyncio.TimeoutError:
        return process_result(ProcessingStatus.TIMEOUT.value, article)
async def process_article(session, morph, charged_words, url, results,
                          max_timeout=ASYNC_TIMEOUT):
    """Rate how "jaundiced" the article at *url* is; append the result dict."""
    async with measure_time():
        rating = None
        word_total = None
        try:
            async with timeout(max_timeout):
                html = await fetch(session, url)
            status = ProcessingStatus.OK
            article = inosmi_ru.sanitize(html)
            async with timeout(max_timeout):
                words = await split_by_words(morph, article)
            rating = calculate_jaundice_rate(words, charged_words)
            word_total = len(words)
        except (ClientError, InvalidURL):
            status = ProcessingStatus.FETCH_ERROR
        except exceptions.ArticleNotFound:
            status = ProcessingStatus.PARSING_ERROR
        except asyncio.exceptions.TimeoutError:
            status = ProcessingStatus.TIMEOUT
        results.append({
            'status': status.value,
            'url': url,
            'rating': rating,
            'words_count': word_total,
        })
async def process_article(session: aiohttp.ClientSession,
                          morph: pymorphy2.MorphAnalyzer,
                          charged_words: list,
                          url: str,
                          results: list,
                          timeout: int = 10) -> None:
    """Fetch and analyse one article, appending the outcome to *results*.

    Return annotation fixed: the function appends to *results* and returns
    nothing — it was wrongly annotated ``-> dict``.
    """
    # Defaults hoisted: every except branch previously reset both to None.
    rating = None
    words_count = None
    try:
        async with async_timeout(timeout):
            html = await fetch(session, url)
        article = sanitize(html)
        with measure_time():
            words = await split_by_words(morph, article)
        rating = calculate_jaundice_rate(words, charged_words)
        words_count = len(words)
        status = ProcessingStatus.OK
    except aiohttp.ClientError:
        status = ProcessingStatus.FETCH_ERROR
    except ArticleNotFound:
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT_ERROR
    results.append({
        'rating': rating,
        'words_count': words_count,
        'status': status.value,
        'url': url
    })
async def process_article(session: aiohttp.ClientSession,
                          morph: pymorphy2.MorphAnalyzer,
                          charged_words: List[str],
                          url: str,
                          timeout=PROCESSING_TIMEOUT) -> dict:
    """Analyse one article and return its status/score summary dict."""
    score = None
    words_count = None
    try:
        async with async_timeout(RESPONSE_TIMEOUT):
            html = await fetch(session, url)
        text = sanitize_html(html, url)
        async with timing_manager(timeout):
            words = await split_by_words(morph, text)
        score = calculate_jaundice_rate(words, charged_words)
        words_count = len(words)
        status = ProcessingStatus.OK
    except aiohttp.ClientError:
        status = ProcessingStatus.FETCH_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT
    except (ArticleNotFound, AdapterNotImplemented):
        status = ProcessingStatus.PARSING_ERROR
    return {
        'status': status.value,
        'url': url,
        'score': score,
        'words_count': words_count,
    }
async def process_article(
        session, morph, charged_words, url, processing_outputs,
        sanitizer_func=None
):
    """Fetch, optionally sanitize and score one article.

    Appends a ``(url, status, score, word_number, processing_time)`` tuple to
    *processing_outputs*; returns nothing. ``sanitizer_func`` (if given) must
    accept ``(html, plaintext=True)`` and may raise ``ArticleNotFound``.
    """
    score = None
    word_number = None
    processing_time = None

    def _record(status):
        # Single exit helper: the output tuple was previously assembled and
        # appended in five nearly identical places.
        processing_outputs.append(
            (url, status, score, word_number, processing_time))

    try:
        async with timeout(TIMEOUT_SECONDS) as timeout_manager:
            html = await fetch(session, url)
    except (ClientConnectorError, InvalidURL, ClientResponseError):
        logger.warning(f'Can not connect to "{url}"')
        _record(ProcessingStatus.FETCH_ERROR)
        return
    except asyncio.TimeoutError:
        if not timeout_manager.expired:
            # The timeout came from elsewhere (e.g. the session itself) —
            # it is not ours to swallow.
            raise
        _record(ProcessingStatus.TIMEOUT)
        return

    plain_text = html
    if sanitizer_func:
        try:
            plain_text = sanitizer_func(html, plaintext=True)
        except exceptions.ArticleNotFound:
            logger.warning(f'No article found on "{url}"')
            _record(ProcessingStatus.PARSING_ERROR)
            return

    try:
        async with timeout(TIMEOUT_SECONDS) as timeout_manager:
            article_words = await split_by_words(morph, plain_text)
    except asyncio.TimeoutError:
        if not timeout_manager.expired:
            raise
        logger.debug(f'Timeout exceeded while processing an article on {url}')
        processing_time = TIMEOUT_SECONDS
        _record(ProcessingStatus.TIMEOUT)
        return

    score = calculate_jaundice_rate(article_words, charged_words)
    word_number = len(article_words)
    processing_time = TIMEOUT_SECONDS - timeout_manager.remaining
    logger.debug(f'{url} has been processed in {processing_time} seconds')
    _record(ProcessingStatus.OK)
async def process_article(session, morph, charged_words, url):
    """Check, download and analyse one article; return a result dict.

    The dict carries ``title``, ``status``, ``score`` and ``words_count``;
    the last two stay ``None`` unless analysis succeeded.
    """
    result = {
        'title': None,
        'status': None,
        'score': None,
        'words_count': None
    }
    try:
        await check_for_available_parse(url)
    except ArticleNotFound as exc:
        result['title'] = f'{exc}'
        result['status'] = ProcessingStatus.PARSING_ERROR.value
        return result

    try:
        async with timeout(5):
            html = await fetch(url, session)
    except InvalidURL as e:
        result['title'] = f'URL {e} Does not exist'
        result['status'] = ProcessingStatus.FETCH_ERROR.value
        return result
    except (ClientError, ClientConnectorError):
        result['title'] = 'Connection error'
        result['status'] = ProcessingStatus.FETCH_ERROR.value
        return result
    except asyncio.TimeoutError:
        result['title'] = 'TimeOut error'
        result['status'] = ProcessingStatus.TIMEOUT.value
        return result

    sanitized_text, title = sanitize(html, plaintext=True)
    async with process_split_by_words(
            morph, sanitized_text) as (words, execution_time, error):
        if error:
            result['title'] = title
            result['status'] = ProcessingStatus.TIMEOUT.value
            return result
        result['score'] = calculate_jaundice_rate(words, charged_words)
        logging.info(f'Анализ статьи произведен за {execution_time:.2f} сек.')
        result['title'] = title
        result['status'] = ProcessingStatus.OK.value
        result['words_count'] = len(words)
        return result
async def process_article(session: aiohttp.client.ClientSession,
                          morph: MorphAnalyzer,
                          charged_words: Tuple[str],
                          url: str,
                          sites_ratings: List[Dict],
                          skip_sanitizer: bool = False):
    """Fetch one article, rate it and append a record to *sites_ratings*.

    ``skip_sanitizer`` passes the raw HTML straight to the splitter instead
    of the domain-specific sanitizer.
    """
    yellow_rate = None
    words_count = None
    processing_time = None
    article_title = None
    try:
        async with timeout(TIMEOUT):
            html: str = await fetch(session, url)
        article_title = extract_title(html)
        domain_name = extract_sanitizer_name(url=url)
        article: str = html if skip_sanitizer else get_sanitizer(
            sanitizer_name=domain_name)(html, plaintext=True)
        with elapsed_timer() as timer:
            article_words: List[str] = await split_by_words(
                morph=morph, text=article, splitting_timeout=TIMEOUT)
        processing_time = round(timer.duration, 3)
        yellow_rate = calculate_jaundice_rate(
            article_words=article_words, charged_words=charged_words)
        words_count = len(article_words)
        status = ProcessingStatus.OK
    except (ClientConnectorError, ClientError, ClientResponseError):
        article_title = 'URL not exist'
        status = ProcessingStatus.FETCH_ERROR
    except (ArticleNotFound, SanitizerNotFound):
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        # Fixed: `except TimeoutError` caught the builtin, which on
        # Python < 3.11 is NOT asyncio.TimeoutError — the TIMEOUT branch
        # never fired and the exception escaped the task. asyncio's name
        # is correct on every version (it aliases the builtin on 3.11+).
        status = ProcessingStatus.TIMEOUT
    sites_ratings.append({
        'url': url,
        'title': article_title,
        'rate': yellow_rate,
        'words': words_count,
        'status': status,
        'processing_time': processing_time,
    })
async def process_article(session, morph, charged_words, url, articles_rate):
    """Fetch one article, rate it and append the rating to *articles_rate*.

    The per-article deadline comes from the MAX_WAITING_TIME environment
    variable (seconds, default 3); failures are handled by the
    ``handle_exceptions`` context manager.
    """
    max_waiting_time = int(os.getenv('MAX_WAITING_TIME', default=3))
    with run_timer():
        with handle_exceptions(articles_rate):
            async with timeout(max_waiting_time):
                html = await fetch(session, url)
                sanitizer = get_sanitize_func(url)
                title, text = sanitizer(html, True)
                words = await text_tools.split_by_words(morph, text)
                rate = text_tools.calculate_jaundice_rate(
                    words, charged_words)
                articles_rate.append({
                    'title': title,
                    'status': str(ProcessingStatus.OK),
                    'rate': rate,
                    'count_words': len(words)
                })
                logging.info(title)
async def process_article(session, morph, charged_words, url, timeouts):
    """Analyse the article at *url* and return an OrderedDict summary.

    *timeouts* is a mapping with ``'fetch'`` and ``'split'`` second limits.
    The returned dict has keys ``status``, ``url``, ``score``,
    ``words_count``; all but ``url`` are ``None`` on failure paths that
    never reached them.
    """
    status = score = words_count = None
    try:
        html = await fetch(session, url, timeouts['fetch'])
        text = SANITIZERS['inosmi_ru'](html, url)
        async with timeout(timeouts['split']):
            words = await split_by_words(morph, text)
        words_count = len(words)
        score = calculate_jaundice_rate(words, charged_words)
        status = ProcessingStatus.OK.name
    except (ClientConnectionError, ClientResponseError, InvalidURL):
        status = ProcessingStatus.CONN_ERROR.name
    except ArticleNotFound:
        # Unused `as e` binding removed.
        status = ProcessingStatus.PARSING_ERROR.name
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT.name
    return OrderedDict(
        zip(['status', 'url', 'score', 'words_count'],
            [status, url, score, words_count]))
async def process_article(
    session: aiohttp.ClientSession,
    morph: pymorphy2.MorphAnalyzer,
    charged_words: List[str],
    url: str,
    results: List[Dict[str, Union[str, int, float, None]]],
    request_timeout: Union[float, int] = 2,
    process_timeout: Union[float, int] = 3,
    cache: Optional[BaseCache] = None,
) -> None:
    """Analyse *url* and append a result record, consulting *cache* first."""
    if cache:
        cached_result = await get_from_cache(cache, url)
        if cached_result:
            # Cache hit short-circuits fetching and analysis entirely.
            results.append(cached_result)
            return
    result = {
        'status': None,
        'url': url,
        'score': None,
        'word_count': None,
    }
    try:
        async with async_timeout.timeout(request_timeout):
            html = await fetch(session, url)
        article_text = SANITIZERS['inosmi_ru'](html, plaintext=True)
        with measure_time():
            async with async_timeout.timeout(process_timeout):
                just_words = await split_by_words(morph, article_text)
    except aiohttp.ClientError:
        result['status'] = ProcessingStatus.FETCH_ERROR.value
    except asyncio.TimeoutError:
        result['status'] = ProcessingStatus.TIMEOUT.value
    except ArticleNotFound:
        result['status'] = ProcessingStatus.PARSING_ERROR.value
    else:
        result['status'] = ProcessingStatus.OK.value
        result['score'] = calculate_jaundice_rate(just_words, charged_words)
        result['word_count'] = len(just_words)
        if cache:
            await set_to_cache(cache, url, result)
    results.append(result)
async def process_article(session, morph, charged_words, url, analyze_results,
                          fetch_timeout=TIMEOUT):
    """Fetch and rate one article; append the record and also return it.

    The record carries ``title``, ``status``, ``rating`` and
    ``words_amount``; the last two stay ``None`` on any failure.
    """
    title = 'URL not exist'
    jaundice_rating = words_amount = None
    status = ProcessingStatus.OK
    try:
        async with timeout(fetch_timeout):
            html = await fetch(session, url)
        article_soup = BeautifulSoup(html, 'html.parser')
        title = article_soup.find('title').string
        article_text = sanitize(html, plaintext=True)
        with runtime_measurement():
            splited_text = await split_by_words(morph, article_text)
        words_amount = len(splited_text)
        jaundice_rating = calculate_jaundice_rate(splited_text, charged_words)
    except aiohttp.ClientError:
        status = ProcessingStatus.FETCH_ERROR
    except adapters.ArticleNotFound:
        # Fixed: the old pattern `(^http[s]:\/\/)?` required a literal "s",
        # so plain http:// URLs never matched and match.group() raised
        # AttributeError instead of reporting PARSING_ERROR.
        domain_pattern = r'(^https?:\/\/)?(?P<domain>\w+\.\w+)'
        match = re.match(domain_pattern, url)
        title = f'Статья с сайта {match.group("domain")}'
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT
    analyze_result = {
        'title': title,
        'status': status.value,
        'rating': jaundice_rating,
        'words_amount': words_amount
    }
    analyze_results.append(analyze_result)
    return analyze_result
async def process_article(session, article_url, morph, charged_words,
                          max_pending_time_of_fetching_article=3,
                          max_pending_time_of_splitting_by_words=3):
    """Download and analyse one article; return a processing-results record."""
    try:
        async with timeout(max_pending_time_of_fetching_article):
            html = await fetch(session, article_url)
        article_text = get_sanitized_article_text(article_url, html)
        with work_time_counter(
                f'Splitting by words for article {article_url}'):
            async with timeout(max_pending_time_of_splitting_by_words):
                article_words = await split_by_words(morph, article_text)
    except aiohttp.ClientError:
        status = ProcessingStatus.FETCH_ERROR
    except (ArticleNotFound, SanitizerNotImplemented):
        status = ProcessingStatus.PARSING_ERROR
    except asyncio.TimeoutError:
        status = ProcessingStatus.TIMEOUT
    else:
        # Success path lives in `else`, replacing the per-branch returns.
        return get_article_processing_results(
            status=ProcessingStatus.OK,
            url=article_url,
            words_count=len(article_words),
            score=calculate_jaundice_rate(article_words, charged_words),
        )
    return get_article_processing_results(status=status, url=article_url)
def test_calculate_jaundice_rate():
    """An empty text scores ~0; one charged word in three scores ~33%."""
    empty_rate = calculate_jaundice_rate([], [])
    assert -0.01 < empty_rate < 0.01
    partial_rate = calculate_jaundice_rate(
        ["все", "аутсайдер", "побег"],
        ["аутсайдер", "банкротство"],
    )
    assert 33.0 < partial_rate < 34.0
async def score_text(morph: pymorphy2.MorphAnalyzer, text: str, negative: list):
    """Split *text* into words and return its jaundice rate against *negative*."""
    tokens = await text_tools.split_by_words(morph=morph, text=text)
    rate = text_tools.calculate_jaundice_rate(tokens, negative)
    return rate
def test_calculate_jaundice_rate():
    """Sanity-check the rate on an empty text and a partially charged one."""
    assert -0.01 < calculate_jaundice_rate([], []) < 0.01
    words = ['все', 'аутсайдер', 'побег']
    charged = ['аутсайдер', 'банкротство']
    assert 33.0 < calculate_jaundice_rate(words, charged) < 34.0