def add_link_wrapper(self, link_info: TLinkInfo):
    assert link_info.target_url is not None
    try:
        downloaded_file = TDownloadedFile(link_info.target_url)
    except THttpRequester.RobotHttpException as err:
        self.logger.error(err)
        return

    href = link_info.target_url
    self.website.url_nodes[link_info.source_url].add_child_link(href, link_info.to_json())
    depth = self.website.url_nodes[link_info.source_url].depth + 1

    # boost the weight if the downloaded html contains declaration-like text
    if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
        html = downloaded_file.convert_html_to_utf8().lower()
        best_match_count = best_declaration_regex_match(html, from_start=False)
        if best_match_count > 0:
            add_weight = best_match_count * TLinkInfo.NORMAL_LINK_WEIGHT
            self.logger.debug("add weight {} to {} using best_declaration_regex_match".format(
                add_weight, link_info.weight))
            link_info.weight += add_weight

    # demote deep links: the deeper the link, the harsher the penalty
    if depth < 15:
        link_info.weight -= 0.1 * depth
    elif depth < 30:
        link_info.weight -= 0.5 * depth
    else:
        link_info.weight -= 6.0 * depth

    link_info.weight = max(link_info.weight, self.url_to_weight.get(href, 0.0))
    self.url_to_weight[href] = link_info.weight

    if href not in self.website.url_nodes:
        if link_info.target_title is None and downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            link_info.target_title = get_html_title(downloaded_file.data)
        self.website.url_nodes[href] = TUrlInfo(
            title=link_info.target_title,
            step_name=self.step_name,
            depth=depth,
            parent_node=link_info.source_url)
    else:
        self.website.url_nodes[href].add_parent_node(link_info.source_url)
        self.website.url_nodes[href].update_depth(depth)

    if self.is_last_step:
        self.website.export_env.export_file_if_relevant(downloaded_file, link_info)
        if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            if self.website.export_env.sha256_is_exported(downloaded_file.get_sha256()):
                link_info.weight = TLinkInfo.MINIMAL_LINK_WEIGHT
                self.logger.debug("set weight {} to an html declaration".format(link_info.weight))

    if self.transitive:
        if href not in self.processed_pages:
            if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                self.pages_to_process[href] = link_info.weight

    # if the page is already queued, keep the maximum of the known weights
    if href in self.pages_to_process:
        self.pages_to_process[href] = max(self.pages_to_process[href], link_info.weight)

    self.logger.debug("add link {} weight={}".format(href, link_info.weight))
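
# A standalone sketch (not part of the original module) of the depth penalty
# applied in add_link_wrapper above, convenient for eyeballing how quickly
# deep links are demoted; the piecewise rule is copied verbatim:
def _depth_penalty_sketch(depth: int) -> float:
    if depth < 15:
        return 0.1 * depth
    elif depth < 30:
        return 0.5 * depth
    return 6.0 * depth

# e.g. _depth_penalty_sketch(10) == 1.0, _depth_penalty_sketch(20) == 10.0,
# _depth_penalty_sketch(40) == 240.0
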
def use_search_engine(self, morda_url):
    request = self.search_engine['request']
    max_results = self.search_engine.get('max_serp_results', 10)
    site = self.website.main_page_url
    self.logger.info('search engine request: {} site:{}'.format(request, site))
    serp_urls = list()
    search_engine = None
    # try the search engines one by one until one of them answers
    for search_engine in range(0, SearchEngineEnum.SearchEngineCount):
        try:
            serp_urls = SearchEngine.site_search(search_engine, site, request, self.get_selenium_driver())
            break
        except (SerpException, THttpRequester.RobotHttpException, WebDriverException,
                InvalidSwitchToTargetException) as err:
            self.logger.error('cannot request search engine, exception: {}'.format(err))
            self.logger.debug("sleep 10 seconds and retry with another search engine")
            time.sleep(10)
            self.get_selenium_driver().restart()
            time.sleep(5)
            self.logger.error('retry...')

    links_count = 0
    for url in serp_urls:
        if not self.website.url_is_not_linked_to_another_project(url):
            continue
        link_info = TLinkInfo(TClickEngine.google, morda_url, url, anchor_text=request)
        link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
        self.add_link_wrapper(link_info)
        links_count += 1
        if max_results == 1:
            break  # one link found
    self.logger.info('found {} links using search engine id={}'.format(links_count, search_engine))
def check_anticorr_link_text(self, link_info: TLinkInfo):
    text = link_info.anchor_text.strip().lower()
    # 'антикоррупционная комиссия' = "anti-corruption commission"
    if text.find('антикоррупционная комиссия') != -1:
        link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
        return True

    # anchors like "противодействие/борьба/нет ... коррупц..."
    # ("counteraction / fight against / no ... corruption")
    if text.startswith(u'противодействие') or text.startswith(u'борьба') or text.startswith(u'нет'):
        if text.find("коррупц") != -1:
            link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
            return True

    # 'отчеты' = "reports"
    if text.find("отчеты") != -1:
        link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
        return True
    return False
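
# A hedged usage sketch for check_anticorr_link_text; the TLinkInfo argument
# order follows the calls elsewhere in this file, while the step instance and
# the urls are hypothetical:
#
#   link_info = TLinkInfo(TClickEngine.manual, "http://example.ru",
#                         "http://example.ru/anticorruption",
#                         anchor_text="Противодействие коррупции")
#   if step.check_anticorr_link_text(link_info):
#       # matched the "противодействие ... коррупц" branch above,
#       # so link_info.weight == TLinkInfo.BEST_LINK_WEIGHT
#       step.add_link_wrapper(link_info)
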
def add_links_from_sitemap_xml(self):
    if self.sitemap_xml_processor is None:
        return
    assert self.website.main_page_url in self.website.url_nodes
    root_page = self.website.main_page_url.strip('/')
    tree = sitemap_tree_for_homepage(root_page)
    cnt = 0
    useful = 0
    for page in tree.all_pages():
        cnt += 1
        weight = self.sitemap_xml_processor(page.url)
        if weight > TLinkInfo.MINIMAL_LINK_WEIGHT:
            if page.url not in self.pages_to_process:
                useful += 1
                link_info = TLinkInfo(TClickEngine.sitemap_xml, self.website.main_page_url, page.url,
                                      anchor_text="")
                link_info.weight = weight
                self.add_link_wrapper(link_info)
    self.logger.info("processed {} links from {}/sitemap.xml, found {} useful links".format(
        cnt, root_page, useful))
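
# A minimal sketch of a sitemap_xml_processor callback, assuming only the
# contract visible above: it takes a page url and returns a link weight, and
# the page is queued when that weight exceeds TLinkInfo.MINIMAL_LINK_WEIGHT.
# The keyword list is illustrative, not the project's actual rule set.
def example_sitemap_xml_processor(url: str) -> float:
    url = url.lower()
    if 'sveden' in url or 'dohod' in url or 'korrup' in url:
        return TLinkInfo.NORMAL_LINK_WEIGHT
    return TLinkInfo.MINIMAL_LINK_WEIGHT
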
def looks_like_a_declaration_link_without_cache(logger, link_info: TLinkInfo):
    # here is a place for ML
    anchor_text_russified = normalize_and_russify_anchor_text(link_info.anchor_text)
    page_html = normalize_and_russify_anchor_text(link_info.page_html)
    positive_case = None
    anchor_best_match = False
    if best_declaration_regex_match(anchor_text_russified):
        anchor_best_match = True
        positive_case = "case 0"
    elif has_negative_words(anchor_text_russified):
        return False

    if link_info.target_url is not None:
        # we make an http head-request here, which is rather slow
        file_extension = get_file_extension_only_by_headers(link_info.target_url)
        if is_video_or_audio_file_extension(file_extension):
            logger.debug("link {} looks like a media file, skipped".format(link_info.target_url))
            return False

    income_regexp = '(доход((ах)|(е)))|(коррупц)'     # "income" / "corruption" word stems
    sved_regexp = '(сведения)|(справк[аи])|(sveden)'  # "information" / "certificate" word stems

    svedenija_anchor = re.search(sved_regexp, anchor_text_russified) is not None or \
                       re.search(sved_regexp, link_info.anchor_text, re.IGNORECASE) is not None
    year_anchor = re.search('\\b20[0-9][0-9]\\b', anchor_text_russified) is not None
    income_page = re.search(income_regexp, page_html) is not None
    source_page_title_has_income_word = re.search(income_regexp, link_info.source_page_title) is not None
    income_anchor = re.search(income_regexp, anchor_text_russified) is not None
    role_anchor = is_public_servant_role(anchor_text_russified)
    office_word = has_office_word_in_beginning(anchor_text_russified)
    geo_leaf_word = has_geo_leaf_word_in_beginning(anchor_text_russified)
    document_url = None
    sub_page = check_sub_page_or_iframe(logger, link_info)
    income_url, svedenija_url, corrupt_url = url_features(link_info.target_url)

    if link_info.element_class is not None:
        if isinstance(link_info.element_class, list):
            for css_class_name in link_info.element_class:
                if re.search(INCOME_URL_REGEXP, css_class_name, re.IGNORECASE):
                    income_url = True

    if positive_case is None:
        if income_page or income_url:
            if svedenija_anchor or year_anchor or sub_page:
                positive_case = "case 1"
            else:
                if document_url is None:
                    # lazy calculation, since it makes a time-consuming http head-request
                    document_url = looks_like_a_document_link(logger, link_info)
                if document_url:
                    positive_case = "case 1"

    # http://arshush.ru/index.php?option=com_content&task=blogcategory&id=62&Itemid=72
    # anchor "Сведения за 2018 год" ("Information for the year 2018"): no topic word
    if positive_case is None:
        if svedenija_anchor or svedenija_url:
            if year_anchor:
                positive_case = "case 2"
            else:
                if document_url is None:
                    document_url = looks_like_a_document_link(logger, link_info)
                if document_url:
                    positive_case = "case 2"

    if positive_case is None:
        if (income_page or income_url) and role_anchor:
            positive_case = "case 3"

    if positive_case is None:
        if source_page_title_has_income_word and income_url:
            positive_case = "case 4"

    if positive_case is None:
        if office_word:
            positive_case = "case 5"

    if positive_case is None:
        if geo_leaf_word:
            positive_case = "case 6"

    if positive_case is None:
        # a very special case for sudrf.ru (we cannot use the domain name here because of unit tests);
        # maybe it should be revised
        # http://oblsud.ros.sudrf.ru/modules.php?name=anticorruption&rid=6
        if link_info.target_url is not None and link_info.target_url.find('name=anticorruption') != -1 and \
                anchor_text_russified is not None and anchor_text_russified.lower().strip().endswith('архив'):
            positive_case = "case 7"
            anchor_best_match = True

    if positive_case is not None:
        weight = TLinkInfo.MINIMAL_LINK_WEIGHT
        if anchor_best_match:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_anchor:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_url:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if svedenija_anchor:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if svedenija_url:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if year_anchor:
            weight += TLinkInfo.TRASH_LINK_WEIGHT  # better than sub_page
        if income_page and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if corrupt_url and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if office_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if geo_leaf_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING

        all_features = (("income_page", income_page), ("income_url", income_url),
                        ('income_anchor', income_anchor), ('svedenija_anchor', svedenija_anchor),
                        ('svedenija_url', svedenija_url), ("document_url", document_url),
                        ("sub_page", sub_page), ("year_anchor", year_anchor),
                        ("corrupt_url", corrupt_url), ('role_anchor', role_anchor),
                        ('anchor_best_match', anchor_best_match), ('office_word', office_word),
                        ('geo_leaf_word', geo_leaf_word))
        all_features_str = ";".join(k for k, v in all_features if v)
        logger.debug("{}, weight={}, features: {}".format(positive_case, weight, all_features_str))
        link_info.weight = weight
        return True

    return False
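
# A worked illustration of the weighting above (symbolic, since the concrete
# constant values live in TLinkInfo): for an anchor like
# "сведения о доходах за 2019 год" on a page that also mentions incomes,
# income_anchor, svedenija_anchor, year_anchor and income_page all fire, so
#   weight = MINIMAL_LINK_WEIGHT
#          + BEST_LINK_WEIGHT            (income_anchor)
#          + NORMAL_LINK_WEIGHT          (svedenija_anchor)
#          + TRASH_LINK_WEIGHT           (year_anchor)
#          + LINK_WEIGHT_FOR_INCREMENTING (income_page, if the running total is positive),
# plus another BEST_LINK_WEIGHT when best_declaration_regex_match also
# accepts the anchor (anchor_best_match).
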
def add_regional_main_pages(self):
    for url in self.website.get_regional_pages():
        link_info = TLinkInfo(TClickEngine.manual, self.website.main_page_url, url)
        link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
        self.add_link_wrapper(link_info)