Exemple #1
    def add_link_wrapper(self, link_info: TLinkInfo):
        """Download a link target, re-weight the link and register it in the crawl state.

        The link is attached to its source node in ``self.website.url_nodes``,
        its weight is adjusted (declaration-regex bonus for HTML targets, a
        depth penalty, never below a weight already stored for the same URL),
        a node is created for a target URL that has none yet, and HTML targets
        may be queued in ``self.pages_to_process``.

        NOTE(review): this excerpt looks incomplete (no ``try:`` line, an empty
        ``except`` body, an unterminated ``TUrlInfo(...)`` call), so it does
        not parse as shown; the comments below flag each spot.
        """
        assert link_info.target_url is not None
        # NOTE(review): the `except` below has no matching `try:` line and no
        # handler body in this excerpt — lines appear to be missing here.
            downloaded_file = TDownloadedFile(link_info.target_url)
        except THttpRequester.RobotHttpException as err:

        href = link_info.target_url

        # Record the link on its source node; the target is one level deeper than the source.
        self.website.url_nodes[link_info.source_url].add_child_link(href, link_info.to_json())
        depth = self.website.url_nodes[link_info.source_url].depth + 1

        # HTML targets get a bonus proportional to the result of
        # best_declaration_regex_match on the lower-cased UTF-8 page text.
        if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            html = downloaded_file.convert_html_to_utf8().lower()
            best_match_count = best_declaration_regex_match(html, from_start=False)
            if best_match_count > 0:
                add_weight = best_match_count * TLinkInfo.NORMAL_LINK_WEIGHT
                self.logger.debug("add weight {} to {} using best_declaration_regex_match".format(
                    add_weight, link_info.weight))
                link_info.weight += add_weight
        # Depth penalty: the per-level factor grows with depth.
        if depth < 15:
            link_info.weight -= 0.1 * depth
        elif depth < 30:
            link_info.weight -= 0.5 * depth
            # NOTE(review): as written, the 6.0 penalty below is applied together
            # with the 0.5 one for 15 <= depth < 30 and never for depth >= 30;
            # presumably an `else:` line was lost before it — confirm.
            link_info.weight -= 6.0 * depth

        # Keep the highest weight seen so far for this URL and remember it.
        link_info.weight = max(link_info.weight, self.url_to_weight.get(href, 0.0))
        self.url_to_weight[href] = link_info.weight

        # First time this URL is seen: create its node, taking the title from
        # the HTML when the link itself carries none.
        if href not in self.website.url_nodes:
            if link_info.target_title is None and downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                link_info.target_title = get_html_title(downloaded_file.data)
            # NOTE(review): the call below is cut off in this excerpt (remaining
            # arguments and the closing parenthesis are not visible).
            self.website.url_nodes[href] = TUrlInfo(title=link_info.target_title, step_name=self.step_name, depth=depth,

        if self.is_last_step:
            self.website.export_env.export_file_if_relevant(downloaded_file, link_info)

        # An HTML file whose sha256 is already exported gets the minimal weight.
        if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            if self.website.export_env.sha256_is_exported(downloaded_file.get_sha256()):
                link_info.weight = TLinkInfo.MINIMAL_LINK_WEIGHT
                self.logger.debug("set weight {} to an html declaration".format(link_info.weight))

        # In transitive mode, queue HTML pages that were not processed yet.
        if self.transitive:
            if href not in self.processed_pages:
                if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                    self.pages_to_process[href] = link_info.weight

        # A queued page keeps the larger of its queued weight and the current one.
        if href in self.pages_to_process:
            self.pages_to_process[href] = max(self.pages_to_process[href], link_info.weight)

        self.logger.debug("add link {} weight={}".format(href, link_info.weight))
Exemple #2
    def use_search_engine(self, morda_url):
        """Run the configured site-restricted search and build links from the results.

        Reads the query from ``self.search_engine['request']`` and the result
        limit from ``self.search_engine['max_serp_results']`` (default 10),
        asks ``SearchEngine.site_search`` for URLs on ``self.website.main_page_url``
        and creates a ``TLinkInfo`` per returned URL, counting them for the log.

        NOTE(review): this excerpt looks incomplete (no ``try:`` line, an ``if``
        without a body, ``link_info`` never consumed), so it does not parse as
        shown; the comments below flag each spot.
        """
        request = self.search_engine['request']
        max_results = self.search_engine.get('max_serp_results', 10)
        site = self.website.main_page_url
        self.logger.info('search engine request: {} site:{}'.format(request, site))
        serp_urls = list()
        search_engine = None
        # Iterate over search engine ids 0 .. SearchEngineCount-1.
        for search_engine in range(0, SearchEngineEnum.SearchEngineCount):
                # NOTE(review): the `try:` line that should precede this call is
                # not visible, and neither is any `break` after a successful
                # request nor the sleep announced in the debug message below.
                serp_urls = SearchEngine.site_search(search_engine, site, request, self.get_selenium_driver())
            except (SerpException, THttpRequester.RobotHttpException, WebDriverException, InvalidSwitchToTargetException) as err:
                self.logger.error('cannot request search engine, exception: {}'.format(err))
                self.logger.debug("sleep 10 seconds and retry other search engine")

        links_count = 0
        for url in serp_urls:
            # NOTE(review): this `if` has no body in the excerpt (presumably a
            # `continue` was dropped) — confirm against the full source.
            if not self.website.url_is_not_linked_to_another_project(url):
            # The click engine is recorded as `google` whichever engine id produced the URL.
            link_info = TLinkInfo(TClickEngine.google, morda_url, url, anchor_text=request)
            link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
            # NOTE(review): link_info is not passed anywhere in the visible code;
            # presumably a call such as add_link_wrapper(link_info) is missing.
            links_count += 1
            # max_results is only checked against 1 in the visible code.
            if max_results == 1:
                break  # one  link found
        self.logger.info('found {} links using search engine id={}'.format(links_count, search_engine))
Exemple #3
    def check_anticorr_link_text(self, link_info: TLinkInfo):
        """Weight a link by anti-corruption keywords in its anchor text.

        The anchor text is stripped and lower-cased, then checked against a few
        Russian phrases in priority order. On the first match
        ``link_info.weight`` is overwritten and ``True`` is returned; when
        nothing matches the weight is left untouched and ``False`` is returned.
        """
        anchor = link_info.anchor_text.strip().lower()

        # "anti-corruption commission" anywhere in the anchor: best weight.
        if 'антикоррупционная комиссия' in anchor:
            link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
            return True

        # A "counteraction" / "fight" / "no" prefix counts only together with
        # the "corrupt" stem somewhere in the anchor.
        topic_prefixes = (u'противодействие', u'борьба', u'нет')
        if anchor.startswith(topic_prefixes) and "коррупц" in anchor:
            link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
            return True

        # "reports" alone earns only the normal weight.
        if "отчеты" in anchor:
            link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
            return True

        return False
Exemple #4
 def add_links_from_sitemap_xml(self):
     """Walk the site's sitemap and create links for pages that score above the minimal weight.

     Every page yielded by ``sitemap_tree_for_homepage(...).all_pages()`` is
     scored with ``self.sitemap_xml_processor``; pages scoring above
     ``TLinkInfo.MINIMAL_LINK_WEIGHT`` that are not already in
     ``self.pages_to_process`` are counted as useful and get a ``TLinkInfo``.
     Totals are written to the log.

     NOTE(review): this excerpt looks incomplete (an ``if`` without a body,
     ``link_info`` never consumed), so it does not parse as shown.
     """
     # NOTE(review): the body of this `if` is not visible (presumably an early
     # `return` when no sitemap processor is configured) — confirm.
     if self.sitemap_xml_processor is None:
     assert self.website.main_page_url in self.website.url_nodes
     # Trailing/leading slashes are stripped before the URL is handed to the sitemap reader.
     root_page = self.website.main_page_url.strip('/')
     tree = sitemap_tree_for_homepage(root_page)
     cnt = 0  # all pages seen in the sitemap
     useful = 0  # pages that passed the weight threshold and were not queued yet
     for page in tree.all_pages():
         cnt += 1
         weight = self.sitemap_xml_processor(page.url)
         if weight > TLinkInfo.MINIMAL_LINK_WEIGHT:
             if page.url not in self.pages_to_process:
                 useful += 1
                 link_info = TLinkInfo(TClickEngine.sitemap_xml, self.website.main_page_url, page.url, anchor_text="")
                 link_info.weight = weight
                 # NOTE(review): link_info is not passed anywhere in the visible
                 # code; presumably a registration call was dropped here.
     self.logger.info("processed {} links from {}/sitemap.xml found {} useful links".format(cnt, root_page, useful))
def looks_like_a_declaration_link_without_cache(logger, link_info: TLinkInfo):
    """Decide heuristically whether a link leads to a declaration and weight it.

    A set of boolean features is derived from the anchor text, the page html,
    the source page title, the target URL and the element CSS classes. The
    features are checked against an ordered list of "positive cases"; the
    first one that fires wins. On success ``link_info.weight`` is set to a sum
    of per-feature weights, the case and features are logged, and ``True`` is
    returned. ``False`` is returned when no case fires, when the anchor has
    negative words without a best regex match, or when the target looks like
    a media file.

    NOTE(review): this excerpt looks incomplete (several unterminated calls
    and a truncated ``all_features`` tuple), so it does not parse as shown;
    the comments below flag each spot.
    """
    # here is a place for ML
    # NOTE(review): the argument and closing parenthesis of the call below are
    # not visible in this excerpt.
    anchor_text_russified = normalize_and_russify_anchor_text(
    page_html = normalize_and_russify_anchor_text(link_info.page_html)
    positive_case = None
    anchor_best_match = False
    # A best-regex hit on the anchor is an immediate positive ("case 0");
    # otherwise negative words in the anchor reject the link outright.
    if best_declaration_regex_match(anchor_text_russified):
        anchor_best_match = True
        positive_case = "case 0"
    elif has_negative_words(anchor_text_russified):
        return False

    if link_info.target_url is not None:
        # we make a  http-head request here, that is rather slow
        # NOTE(review): the arguments of the next call and of the debug
        # `.format(` below are cut off in this excerpt.
        file_extension = get_file_extension_only_by_headers(
        if is_video_or_audio_file_extension(file_extension):
            logger.debug("link {} looks like a media file, skipped".format(
            return False

    # Feature extraction: each flag below records one independent signal.
    income_regexp = '(доход((ах)|(е)))|(коррупц)'
    sved_regexp = '(сведения)|(справк[аи])|(sveden)'
    # The russified anchor is matched case-sensitively, the raw anchor case-insensitively.
    svedenija_anchor = re.search(sved_regexp, anchor_text_russified) is not None or \
                       re.search(sved_regexp, link_info.anchor_text, re.IGNORECASE) is not None
    # A standalone four-digit number 2000..2099 in the anchor.
    year_anchor = re.search('\\b20[0-9][0-9]\\b',
                            anchor_text_russified) is not None
    income_page = re.search(income_regexp, page_html) is not None
    source_page_title_has_income_word = re.search(
        income_regexp, link_info.source_page_title) is not None
    income_anchor = re.search(income_regexp, anchor_text_russified) is not None
    role_anchor = is_public_servant_role(anchor_text_russified)
    office_word = has_office_word_in_beginning(anchor_text_russified)
    geo_leaf_word = has_geo_leaf_word_in_beginning(anchor_text_russified)
    # document_url stays None until one of the cases below actually needs it.
    document_url = None
    sub_page = check_sub_page_or_iframe(logger, link_info)
    income_url, svedenija_url, corrupt_url = url_features(link_info.target_url)
    # A CSS class matching INCOME_URL_REGEXP is treated like an income URL
    # (only when element_class is a list).
    if link_info.element_class is not None:
        if isinstance(link_info.element_class, list):
            for css_class_name in link_info.element_class:
                if re.search(INCOME_URL_REGEXP, css_class_name, re.IGNORECASE):
                    income_url = True

    if positive_case is None:
        if income_page or income_url:
            if svedenija_anchor or year_anchor or sub_page:
                positive_case = "case 1"
                # NOTE(review): as written the document check below cannot change
                # the outcome (positive_case is already "case 1"); presumably an
                # `else:` line was lost before it — confirm against the full source.
                if document_url is None:
                    document_url = looks_like_a_document_link(
                        logger, link_info
                    )  # lazy calculation since it has a time-consuming head http-request
                if document_url:
                    positive_case = "case 1"

    # http://arshush.ru/index.php?option=com_content&task=blogcategory&id=62&Itemid=72
    # "Сведения за 2018 год" ("Information for 2018") - no topic word
    if positive_case is None:
        if svedenija_anchor or svedenija_url:
            if year_anchor:
                positive_case = "case 2"
                # NOTE(review): same pattern as in case 1 — the document check is
                # redundant as written; an `else:` was presumably dropped.
                if document_url is None:
                    document_url = looks_like_a_document_link(
                        logger, link_info)
                if document_url:
                    positive_case = "case 2"

    if positive_case is None:
        if (income_page or income_url) and role_anchor:
            positive_case = "case 3"

    if positive_case is None:
        if source_page_title_has_income_word and income_url:
            positive_case = "case 4"

    if positive_case is None:
        if office_word:
            positive_case = "case 5"

    if positive_case is None:
        if geo_leaf_word:
            positive_case = "case 6"

    if positive_case is None:
        #very special case for sudrf.ru (cannot use domain name here because of unittests)
        #may be it should be revised
        # The anchor must end with "архив" ("archive") and the URL must contain 'name=anticorruption'.
        if link_info.target_url is not None  and link_info.target_url.find('name=anticorruption') != -1 and \
            anchor_text_russified is not None and anchor_text_russified.lower().strip().endswith('архив'):
            positive_case = "case 7"
            anchor_best_match = True

    if positive_case is not None:
        # The final weight starts at the minimum and accumulates one increment per feature that fired.
        weight = TLinkInfo.MINIMAL_LINK_WEIGHT
        if anchor_best_match:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_anchor:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_url:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if svedenija_anchor:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if svedenija_url:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if year_anchor:
            weight += TLinkInfo.TRASH_LINK_WEIGHT  # better than sub_page
        # The page-level and corrupt-url bonuses apply only when the weight is already positive.
        if income_page and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if corrupt_url and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if office_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if geo_leaf_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING

        # (name, flag) pairs used only to log which features fired.
        # NOTE(review): the tuple literal below is truncated in this excerpt —
        # several pairs lack their second element and the closing parentheses
        # are not visible.
        all_features = (("income_page", income_page), ("income_url",
                        ('income_anchor', income_anchor), ('svedenija_anchor',
                        ('svedenija_url', svedenija_url), ("document_url",
                        ("sub_page", sub_page), ("year_anchor", year_anchor),
                        ("corrupt_url", corrupt_url), ('role_anchor',
                         anchor_best_match), ('office_word',
                                              office_word), ('geo_leaf_word',

        # Only the names of the truthy features go into the log line.
        all_features_str = ";".join(k for k, v in all_features if v)
        logger.debug("{}, weight={}, features: {}".format(
            positive_case, weight, all_features_str))
        link_info.weight = weight
        return True
    return False
Exemple #6
 def add_regional_main_pages(self):
     """Build a manual, normal-weight link for every regional page of the website.

     NOTE(review): the visible body creates each ``TLinkInfo`` but never uses
     it; the excerpt probably ends before the call that registers the link —
     confirm against the full source.
     """
     for url in self.website.get_regional_pages():
         # Positional arguments: click engine, main page URL, regional URL
         # (presumably source URL then target URL — verify against TLinkInfo).
         link_info = TLinkInfo(TClickEngine.manual, self.website.main_page_url, url)
         link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT