def use_search_engine(self, morda_url):
    request = self.search_engine['request']
    max_results = self.search_engine.get('max_serp_results', 10)
    site = self.website.main_page_url
    self.logger.info('search engine request: {} site:{}'.format(request, site))
    serp_urls = list()
    search_engine = None
    for search_engine in range(0, SearchEngineEnum.SearchEngineCount):
        try:
            serp_urls = SearchEngine.site_search(search_engine, site, request, self.get_selenium_driver())
            break
        except (SerpException, THttpRequester.RobotHttpException, WebDriverException,
                InvalidSwitchToTargetException) as err:
            self.logger.error('cannot request search engine, exception: {}'.format(err))
            self.logger.debug("sleep 10 seconds and retry with another search engine")
            time.sleep(10)
            self.get_selenium_driver().restart()
            time.sleep(5)
            self.logger.error('retry...')

    links_count = 0
    for url in serp_urls:
        if not self.website.url_is_not_linked_to_another_project(url):
            continue
        link_info = TLinkInfo(TClickEngine.google, morda_url, url, anchor_text=request)
        link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
        self.add_link_wrapper(link_info)
        links_count += 1
        if max_results == 1:
            break  # one link found is enough
    self.logger.info('found {} links using search engine id={}'.format(links_count, search_engine))
def click_element(self, element, link_info: TLinkInfo):
    if self.download_folder is not None:
        make_folder_empty(self.download_folder)
    assert link_info.target_url is None
    # may differ from link_info.source_url because of redirects
    save_current_url = self.the_driver.current_url
    # CONTROL+click opens the link in a new tab, so the current page survives the click
    ActionChains(self.the_driver) \
        .move_to_element(element) \
        .pause(1) \
        .key_down(Keys.CONTROL) \
        .click(element) \
        .key_up(Keys.CONTROL) \
        .perform()
    time.sleep(3)
    if self.download_folder is not None:
        link_info.downloaded_file = self.wait_download_finished(180)
    if len(self.the_driver.window_handles) > 1:
        # a new tab was opened: read the target url and title from it
        try:
            self.the_driver.set_page_load_timeout(2)
            self.the_driver.switch_to.window(self.the_driver.window_handles[-1])
            if self.the_driver.current_url != link_info.source_url and self.the_driver.current_url != 'about:blank':
                link_info.set_target(self.the_driver.current_url, self.the_driver.title)
        except Exception:
            pass
        finally:
            self.the_driver.set_page_load_timeout(self.page_load_timeout)
        self.close_not_first_tab()
    self.the_driver.switch_to.window(self.the_driver.window_handles[0])
    if self.the_driver.current_url != save_current_url:
        self.logger.error("cannot switch back to the saved url: expected {}, got {}, keep going".format(
            save_current_url, self.the_driver.current_url))
def add_link_wrapper(self, link_info: TLinkInfo):
    assert link_info.target_url is not None
    try:
        downloaded_file = TDownloadedFile(link_info.target_url)
    except THttpRequester.RobotHttpException as err:
        self.logger.error(err)
        return

    href = link_info.target_url
    self.website.url_nodes[link_info.source_url].add_child_link(href, link_info.to_json())
    depth = self.website.url_nodes[link_info.source_url].depth + 1

    if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
        html = downloaded_file.convert_html_to_utf8().lower()
        best_match_count = best_declaration_regex_match(html, from_start=False)
        if best_match_count > 0:
            add_weight = best_match_count * TLinkInfo.NORMAL_LINK_WEIGHT
            self.logger.debug("add weight {} to {} using best_declaration_regex_match".format(
                add_weight, link_info.weight))
            link_info.weight += add_weight

    # penalize deep links: the deeper the page, the less likely it is a declaration page
    if depth < 15:
        link_info.weight -= 0.1 * depth
    elif depth < 30:
        link_info.weight -= 0.5 * depth
    else:
        link_info.weight -= 6.0 * depth

    link_info.weight = max(link_info.weight, self.url_to_weight.get(href, 0.0))
    self.url_to_weight[href] = link_info.weight

    if href not in self.website.url_nodes:
        if link_info.target_title is None and downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            link_info.target_title = get_html_title(downloaded_file.data)
        self.website.url_nodes[href] = TUrlInfo(title=link_info.target_title, step_name=self.step_name,
                                                depth=depth, parent_node=link_info.source_url)
    else:
        self.website.url_nodes[href].add_parent_node(link_info.source_url)
        self.website.url_nodes[href].update_depth(depth)

    if self.is_last_step:
        self.website.export_env.export_file_if_relevant(downloaded_file, link_info)

    if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
        if self.website.export_env.sha256_is_exported(downloaded_file.get_sha256()):
            link_info.weight = TLinkInfo.MINIMAL_LINK_WEIGHT
            self.logger.debug("set weight {} to an already exported html declaration".format(link_info.weight))

    if self.transitive:
        if href not in self.processed_pages:
            if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                self.pages_to_process[href] = link_info.weight

    if href in self.pages_to_process:
        self.pages_to_process[href] = max(self.pages_to_process[href], link_info.weight)

    self.logger.debug("add link {} weight={}".format(href, link_info.weight))
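# Worked arithmetic for the depth penalty in add_link_wrapper above: a link at
# depth 10 loses 0.1 * 10 = 1.0 weight, a link at depth 20 loses 0.5 * 20 = 10.0,
# and a link at depth 40 loses 6.0 * 40 = 240.0, so very deep pages are
# effectively pushed out of the crawl frontier.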
def looks_like_a_declaration_link(self, link_info: TLinkInfo):
    # return looks_like_a_declaration_link_without_cache(self.logger, link_info)
    if link_info.is_hashable():
        result = self.declaration_links_cache.get(link_info.hash_by_target())
        if result is not None:
            return result
    result = looks_like_a_declaration_link_without_cache(self.logger, link_info)
    if link_info.is_hashable():
        self.declaration_links_cache[link_info.hash_by_target()] = result
    return result
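# A minimal standalone sketch of the cache-by-hash pattern used in
# looks_like_a_declaration_link above: memoize a slow predicate under a stable
# hash of its argument. All names here (TItem, expensive_predicate, _CACHE)
# are hypothetical and not part of this project.
import hashlib


class TItem:
    def __init__(self, key: str):
        self.key = key

    def stable_hash(self) -> str:
        # plays the role of TLinkInfo.hash_by_target()
        return hashlib.sha256(self.key.encode('utf-8')).hexdigest()


_CACHE = dict()


def expensive_predicate(item: TItem) -> bool:
    return len(item.key) % 2 == 0  # stands in for a slow check


def cached_predicate(item: TItem) -> bool:
    cached = _CACHE.get(item.stable_hash())
    if cached is not None:  # "is not None", so a cached False is also reused
        return cached
    result = expensive_predicate(item)
    _CACHE[item.stable_hash()] = result
    return result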
def check_anticorr_link_text(self, link_info: TLinkInfo):
    text = link_info.anchor_text.strip().lower()
    if text.find('антикоррупционная комиссия') != -1:
        link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
        return True

    if text.startswith(u'противодействие') or text.startswith(u'борьба') or text.startswith(u'нет'):
        if text.find("коррупц") != -1:
            link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
            return True

    if text.find("отчеты") != -1:
        link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
        return True
    return False
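# A hedged usage sketch for check_anticorr_link_text: the stub below imitates
# only the anchor_text and weight fields of TLinkInfo and is not the real
# class; `step` is assumed to be a TRobotStep instance.
class TStubLinkInfo:
    def __init__(self, anchor_text):
        self.anchor_text = anchor_text
        self.weight = 0.0

# step.check_anticorr_link_text(TStubLinkInfo('Противодействие коррупции'))  # True, BEST_LINK_WEIGHT
# step.check_anticorr_link_text(TStubLinkInfo('Отчеты'))                     # True, NORMAL_LINK_WEIGHT
# step.check_anticorr_link_text(TStubLinkInfo('Новости'))                    # False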
def add_downloaded_file_manually(self, downloaded_file: TDownloadedFile, href=None, declaration_year=None):
    if href is None:
        href = self.website.main_page_url
    link_info = TLinkInfo(TClickEngine.selenium, self.website.main_page_url, href,
                          source_html="", anchor_text="", tag_name="a", element_index=1,
                          downloaded_file=downloaded_file, declaration_year=declaration_year)
    self.add_downloaded_file_wrapper(link_info)
def gossov_tatarstan_ru(web_site: TWebSiteCrawlSnapshot):
    web_site.create_export_folder()
    robot_step = web_site.robot_steps[-1]
    driver = robot_step.get_selenium_driver()
    elements = driver.navigate_and_get_links_js("https://gossov.tatarstan.ru/structure/deputaty")
    for element in elements:
        person_href = element['href']
        if person_href is not None and person_href.find('person_id') != -1:
            robot_step.add_link_wrapper(TLinkInfo(TClickEngine.manual, robot_step.website.main_page_url, person_href))
            file = TDownloadedFile(person_href)
            parser = THtmlParser(file.data, url=person_href)
            for html_link in parser.soup.findAll("a"):
                href_pdf = html_link.attrs.get('href', '')
                if href_pdf.find('revenue') != -1:
                    href_pdf = parser.make_link_soup(href_pdf)
                    robot_step.add_link_wrapper(TLinkInfo(TClickEngine.manual, person_href, href_pdf))
def check_follow(self, src, trg, canon):
    if not src.startswith('http'):
        src = 'http://' + src
    if not trg.startswith('http'):
        trg = 'http://' + trg
    link_info = TLinkInfo(TClickEngine.selenium, src, trg)
    can_follow = self.robot_step.can_follow_this_link(link_info)
    self.assertEqual(canon, can_follow, msg="{} -> {}".format(src, trg))
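# A possible call site for check_follow inside a unittest; the urls below are
# made-up examples, and the expected booleans depend on the concrete
# can_follow_this_link rules of the robot step under test:
# self.check_follow('example.com', 'example.com/docs', True)    # same site
# self.check_follow('example.com', 'other-site.org', False)     # foreign site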
def add_links_from_sitemap_xml(self):
    if self.sitemap_xml_processor is None:
        return
    assert self.website.main_page_url in self.website.url_nodes
    root_page = self.website.main_page_url.strip('/')
    tree = sitemap_tree_for_homepage(root_page)
    cnt = 0
    useful = 0
    for page in tree.all_pages():
        cnt += 1
        weight = self.sitemap_xml_processor(page.url)
        if weight > TLinkInfo.MINIMAL_LINK_WEIGHT:
            if page.url not in self.pages_to_process:
                useful += 1
                link_info = TLinkInfo(TClickEngine.sitemap_xml, self.website.main_page_url, page.url, anchor_text="")
                link_info.weight = weight
                self.add_link_wrapper(link_info)
    self.logger.info("processed {} links from {}/sitemap.xml, found {} useful links".format(cnt, root_page, useful))
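# A sketch of what a sitemap_xml_processor callback might look like: it takes
# a url and returns a link weight that add_links_from_sitemap_xml compares
# against TLinkInfo.MINIMAL_LINK_WEIGHT. The regexp and the numeric weights
# below are illustrative assumptions, not the project's actual filter.
import re


def example_sitemap_xml_processor(url: str) -> float:
    if re.search('(sveden|income|anti.?corrup)', url, re.IGNORECASE):
        return 50.0  # stands in for a weight above TLinkInfo.MINIMAL_LINK_WEIGHT
    return 0.0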
def test_download_doc(self):
    shutil.rmtree(TDownloadEnv.get_download_folder(), ignore_errors=True)
    elements = self.get_all_link_elements('http://aot.ru/doc_examples/test.html')
    url_and_elements = self.check_anchor(elements, "test.doc")
    url, element = list(url_and_elements)[0]
    link_info = TLinkInfo(TClickEngine.selenium, url, None)
    self.driver_holder.click_element(element, link_info)
    self.driver_holder.wait_download_finished()
    download_files = os.listdir(TDownloadEnv.get_download_folder())
    self.assertEqual(len(download_files), 1)
def click(driver_holder, url, element_index):
    elements = driver_holder.navigate_and_get_links_js(url)
    element = elements[element_index]['id']
    print("click element {} anchor={}".format(element_index, element.text))
    link_info = TLinkInfo(TClickEngine.selenium, url, None, anchor_text=element.text)
    driver_holder.click_element(element, link_info)
    print("href={}".format(link_info.target_url))
    print("downloaded_file={}".format(link_info.downloaded_file))
def build_link_info(self, main_url, page_html, element_index, element, html_title):
    link_text = element['anchor'].strip('\n\r\t ') if element['anchor'] is not None else ""
    return TLinkInfo(TClickEngine.selenium,
                     source_url=main_url,
                     target_url=element['href'],
                     source_html=page_html,
                     anchor_text=link_text,
                     tag_name=element['id'].tag_name,
                     element_index=element_index,
                     element_class=[element.get('class')],
                     source_page_title=html_title)
def click_selenium_if_no_href(self, main_url, element, element_index, check_link_func):
    tag_name = element.tag_name
    link_text = element.text.strip('\n\r\t ')  # read it here, the element can become stale after the click
    page_html = self.get_selenium_driver().the_driver.page_source
    THttpRequester.consider_request_policy(main_url + " elem_index=" + str(element_index), "click_selenium")

    link_info = TLinkInfo(TClickEngine.selenium, main_url, None,
                          source_html=page_html, anchor_text=link_text, tag_name=tag_name,
                          element_index=element_index,
                          source_page_title=self.get_selenium_driver().the_driver.title)

    self.get_selenium_driver().click_element(element, link_info)

    if self.normalize_and_check_link(link_info, check_link_func):
        if link_info.downloaded_file is not None:
            self.add_downloaded_file_wrapper(link_info)
        elif link_info.target_url is not None:
            self.add_link_wrapper(link_info)
def test_mid_video(self):
    THttpRequester.ENABLE_HEAD_REQUESTS = True
    link_info = TLinkInfo(
        TClickEngine.selenium,
        source_url='https://www.mid.ru/ru/brifingi/-/asset_publisher/MCZ7HQuMdqBY/content/id/4781270#12',
        target_url='https://www.mid.ru/documents/10180/4780294/210610%281%29.mp4/8acd221f-cb28-4522-a251-5437b160672e')
    logger = self.logger

    class TDummyProject:
        def __init__(self):
            self.config = TRobotConfig.read_by_config_type("prod")

    class TDummyOffice:
        def __init__(self):
            self.logger = logger
            self.parent_project = TDummyProject()

    step_info = TRobotStep(TDummyOffice())
    res = step_info.normalize_and_check_link(link_info, TRobotStep.looks_like_a_declaration_link)
    self.assertFalse(res)
def filter_link_elements_by_anchor(self, link_elements, start_anchor_text):
    urls_and_elements = set()
    for element_index, element in enumerate(link_elements):
        try:
            if element['anchor'] is None:
                continue
            link_text = element['anchor'].strip('\n\r\t ')
            self.logger.debug("check link anchor={}, element_index={}".format(link_text, element_index))
            if link_text.lower().startswith(start_anchor_text.lower()):
                self.logger.debug("found link anchor={}".format(link_text))
                href = element['href']
                if href is None:
                    # no href attribute: click the element to learn the target url, then go back
                    link_info = TLinkInfo(TClickEngine.selenium, self.driver_holder.the_driver.current_url, None)
                    self.driver_holder.click_element(element['id'], link_info)
                    href = self.driver_holder.the_driver.current_url
                    self.driver_holder.the_driver.back()
                urls_and_elements.add((href, element['id']))
        except Exception as exp:
            self.logger.error(exp)
    return urls_and_elements
def looks_like_a_declaration_link_without_cache(logger, link_info: TLinkInfo):
    # here is a place for ML
    anchor_text_russified = normalize_and_russify_anchor_text(link_info.anchor_text)
    page_html = normalize_and_russify_anchor_text(link_info.page_html)
    positive_case = None
    anchor_best_match = False
    if best_declaration_regex_match(anchor_text_russified):
        anchor_best_match = True
        positive_case = "case 0"
    elif has_negative_words(anchor_text_russified):
        return False

    if link_info.target_url is not None:
        # we make an http HEAD request here, which is rather slow
        file_extension = get_file_extension_only_by_headers(link_info.target_url)
        if is_video_or_audio_file_extension(file_extension):
            logger.debug("link {} looks like a media file, skipped".format(link_info.target_url))
            return False

    income_regexp = '(доход((ах)|(е)))|(коррупц)'
    sved_regexp = '(сведения)|(справк[аи])|(sveden)'
    svedenija_anchor = re.search(sved_regexp, anchor_text_russified) is not None or \
                       re.search(sved_regexp, link_info.anchor_text, re.IGNORECASE) is not None
    year_anchor = re.search('\\b20[0-9][0-9]\\b', anchor_text_russified) is not None
    income_page = re.search(income_regexp, page_html) is not None
    source_page_title_has_income_word = re.search(income_regexp, link_info.source_page_title) is not None
    income_anchor = re.search(income_regexp, anchor_text_russified) is not None
    role_anchor = is_public_servant_role(anchor_text_russified)
    office_word = has_office_word_in_beginning(anchor_text_russified)
    geo_leaf_word = has_geo_leaf_word_in_beginning(anchor_text_russified)
    document_url = None
    sub_page = check_sub_page_or_iframe(logger, link_info)
    income_url, svedenija_url, corrupt_url = url_features(link_info.target_url)

    if link_info.element_class is not None:
        if isinstance(link_info.element_class, list):
            for css_class_name in link_info.element_class:
                if re.search(INCOME_URL_REGEXP, css_class_name, re.IGNORECASE):
                    income_url = True

    if positive_case is None:
        if income_page or income_url:
            if svedenija_anchor or year_anchor or sub_page:
                positive_case = "case 1"
            else:
                if document_url is None:
                    # lazy calculation, since it needs a time-consuming HEAD http request
                    document_url = looks_like_a_document_link(logger, link_info)
                if document_url:
                    positive_case = "case 1"

    # http://arshush.ru/index.php?option=com_content&task=blogcategory&id=62&Itemid=72
    # "Сведения за 2018 год" - no topic word
    if positive_case is None:
        if svedenija_anchor or svedenija_url:
            if year_anchor:
                positive_case = "case 2"
            else:
                if document_url is None:
                    document_url = looks_like_a_document_link(logger, link_info)
                if document_url:
                    positive_case = "case 2"

    if positive_case is None:
        if (income_page or income_url) and role_anchor:
            positive_case = "case 3"

    if positive_case is None:
        if source_page_title_has_income_word and income_url:
            positive_case = "case 4"

    if positive_case is None:
        if office_word:
            positive_case = "case 5"

    if positive_case is None:
        if geo_leaf_word:
            positive_case = "case 6"

    if positive_case is None:
        # a very special case for sudrf.ru (cannot use the domain name here because of unittests),
        # maybe it should be revised
        # http://oblsud.ros.sudrf.ru/modules.php?name=anticorruption&rid=6
        if link_info.target_url is not None and link_info.target_url.find('name=anticorruption') != -1 and \
                anchor_text_russified is not None and anchor_text_russified.lower().strip().endswith('архив'):
            positive_case = "case 7"
            anchor_best_match = True

    if positive_case is not None:
        weight = TLinkInfo.MINIMAL_LINK_WEIGHT
        if anchor_best_match:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_anchor:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_url:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if svedenija_anchor:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if svedenija_url:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if year_anchor:
            weight += TLinkInfo.TRASH_LINK_WEIGHT  # better than sub_page
        if income_page and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if corrupt_url and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if office_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if geo_leaf_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        all_features = (("income_page", income_page),
                        ("income_url", income_url),
                        ('income_anchor', income_anchor),
                        ('svedenija_anchor', svedenija_anchor),
                        ('svedenija_url', svedenija_url),
                        ("document_url", document_url),
                        ("sub_page", sub_page),
                        ("year_anchor", year_anchor),
                        ("corrupt_url", corrupt_url),
                        ('role_anchor', role_anchor),
                        ('anchor_best_match', anchor_best_match),
                        ('office_word', office_word),
                        ('geo_leaf_word', geo_leaf_word))
        all_features_str = ";".join(k for k, v in all_features if v)
        logger.debug("{}, weight={}, features: {}".format(positive_case, weight, all_features_str))
        link_info.weight = weight
        return True

    return False
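# A worked example of the weighting above, in symbolic form (the numeric
# values of the TLinkInfo constants are not assumed here). For the anchor
# "Сведения о доходах за 2019 год" on a page that also mentions incomes,
# svedenija_anchor, income_anchor, year_anchor and income_page all fire,
# positive_case becomes "case 1", and
#   weight = MINIMAL_LINK_WEIGHT
#          + BEST_LINK_WEIGHT             (income_anchor)
#          + NORMAL_LINK_WEIGHT           (svedenija_anchor)
#          + TRASH_LINK_WEIGHT            (year_anchor)
#          + LINK_WEIGHT_FOR_INCREMENTING (income_page, if the sum so far is positive)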
def add_regional_main_pages(self):
    for url in self.website.get_regional_pages():
        link_info = TLinkInfo(TClickEngine.manual, self.website.main_page_url, url)
        link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
        self.add_link_wrapper(link_info)