def it_logging_warning_if_broken_file(xml_sitemap_bad, site_url, caplog):
    """A broken sitemap file must be logged by sitemap.get_urls and yield no URLs."""
    res = tm._get_urls(xml_sitemap_bad, site_url, tm.get_file_type(xml_sitemap_bad))
    # BUG FIX: pytest's caplog.text is a property, not a method --
    # calling it as caplog.text() raised "TypeError: 'str' object is not callable"
    # and made this test fail for the wrong reason.
    print('caplog.text', caplog.text)
    assert 'sitemap.get_urls' in caplog.text
    assert res == []
def scan_sp(next_step=False, max_limit=0):
    """Single-threaded crawl pass over the pending pages.

    For each page returned by the database: fetch its content, extract new
    URLs (until ``max_limit`` URLs have been added in total), store them, and
    for HTML pages compute and store per-person ranks.

    next_step -- unused here; presumably reserved for a follow-up crawl mode
                 -- TODO confirm against callers.
    max_limit -- soft cap on the total number of URLs added this run.
    Returns the total number of URLs added.
    """
    logging.info('solo crawler start')
    keywords, all_robots = _init_crawler()
    pages = database.get_pages_rows(None)
    # TODO: add a check for len(pages) == 0 --
    # in that case find the smallest date and select pages by it.
    add_urls_total = 0
    for page in pages:
        page_id, url, site_id, base_url = page
        request_time = time.time()  # wall-clock start; reused below for elapsed time
        logging.info('#BEGIN %s url %s, base_url %s', page_id, url, base_url)
        content = _get_content(url)
        robots = all_robots.get(site_id)  # robots rules for this site, or None
        if add_urls_total >= max_limit:
            # Cap reached: still classify the page (the type drives the
            # branches below), but add no new URLs.
            page_type = sitemap.get_file_type(content)
            add_urls_count = 0
        else:
            # NOTE(review): page_id is intentionally re-bound by both calls
            # below -- scan_urls and add_urls each return a (possibly updated)
            # page id; do not reorder these statements.
            new_pages_data, page_id, page_type = sitemap.scan_urls(
                content, page, robots)
            if len(new_pages_data) > max_limit:
                # NOTE(review): this keeps max_limit + 1 items, which exceeds
                # the cap by one -- looks like an off-by-one ([:max_limit]?);
                # confirm intended semantics before changing.
                new_pages_data = new_pages_data[:max_limit + 1]
            add_urls_count, page_id = database.add_urls(
                new_pages_data, page_id)
        if page_type != sitemap.SM_TYPE_HTML:
            # Non-HTML (sitemap-like) pages are marked as scanned immediately.
            database.update_last_scan_date(page_id)
        if page_type == sitemap.SM_TYPE_HTML:
            # HTML pages additionally get per-person keyword ranks computed
            # and persisted.
            ranks, page_id = parsers.process_ranks(
                content, page_id, keywords,
            )
            database.update_person_page_rank(page_id, ranks)
        request_time = time.time() - request_time  # elapsed seconds for this page
        logging.info('#END url %s, base_url %s, add urls %s, time %s',
                     url, base_url, add_urls_count, request_time)
        add_urls_total = add_urls_total + add_urls_count
    logging.info('Crawler.scan: Add %s new urls on date %s',
                 add_urls_total, 'NULL')
    return add_urls_total
def parse_html(page_content, words_dict):
    """Count keyword occurrences per person in an HTML page.

    page_content -- raw page content (any type accepted by get_file_type)
    words_dict -- {"person_id": [words_list]}
    Returns {"person_id": rank}; an empty dict when the content is not
    HTML or when text extraction/counting fails (best-effort: errors are
    logged, never raised).
    """
    if get_file_type(page_content) != SM_TYPE_HTML:
        return {}
    logging.debug('parse_html: %s', words_dict)
    minimum_word_length = 3  # shortest token that still counts as a word
    ranks = {}
    try:
        text = _extract_text(page_content)
        tokens = _split_text(text, minimum_word_length)
        ranks = _count_words(tokens, words_dict)
    except Exception as ex:  # broad on purpose: a broken page must not kill the crawl
        logging.error("parsers.parse_html: error %s", ex)
    logging.debug('parse_html %s completed...', ranks)
    return ranks
def it_return_tuple_of_urls_rec(rec_sitemap, site_url, rec_list):
    """A recursive sitemap yields exactly the expected URL collection."""
    file_type = tm.get_file_type(rec_sitemap)
    assert tm._get_urls(rec_sitemap, site_url, file_type) == rec_list
def it_return_tuple_of_urls_txt(txt_sitemap, site_url, urls_list):
    """A plain-text sitemap yields exactly the expected URL collection."""
    file_type = tm.get_file_type(txt_sitemap)
    assert tm._get_urls(txt_sitemap, site_url, file_type) == urls_list
def it_return_tuple_of_urls_html(html_sitemap, site_url, urls_list):
    """An HTML sitemap yields exactly the expected URL collection."""
    file_type = tm.get_file_type(html_sitemap)
    assert tm._get_urls(html_sitemap, site_url, file_type) == urls_list
def it_return_type_of_sitemap_txt(txt_sitemap):
    """Plain-text sitemap content is classified as SM_TYPE_TXT."""
    detected = tm.get_file_type(txt_sitemap)
    assert detected == tm.SM_TYPE_TXT
def it_return_type_of_sitemap_html(html_sitemap):
    """HTML sitemap content is classified as SM_TYPE_HTML."""
    detected = tm.get_file_type(html_sitemap)
    assert detected == tm.SM_TYPE_HTML
def it_return_type_of_sitemap_xml(xml_sitemap):
    """XML sitemap content is classified as SM_TYPE_XML."""
    detected = tm.get_file_type(xml_sitemap)
    assert detected == tm.SM_TYPE_XML