Code Example #1
File: robot_step.py Project: TI-Russia/smart_parser
    def use_search_engine(self, morda_url):
        request = self.search_engine['request']
        max_results = self.search_engine.get('max_serp_results', 10)
        site = self.website.main_page_url
        self.logger.info('search engine request: {} site:{}'.format(request, site))
        serp_urls = list()
        search_engine = None
        for search_engine in range(0, SearchEngineEnum.SearchEngineCount):
            try:
                serp_urls = SearchEngine.site_search(search_engine, site, request, self.get_selenium_driver())
                break
            except (SerpException, THttpRequester.RobotHttpException, WebDriverException, InvalidSwitchToTargetException) as err:
                self.logger.error('cannot request search engine, exception: {}'.format(err))
                self.logger.debug("sleep 10 seconds and retry other search engine")
                time.sleep(10)
                self.get_selenium_driver().restart()
                time.sleep(5)
                self.logger.error('retry...')

        links_count = 0
        for url in serp_urls:
            if not self.website.url_is_not_linked_to_another_project(url):
                continue
            link_info = TLinkInfo(TClickEngine.google, morda_url, url, anchor_text=request)
            link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
            self.add_link_wrapper(link_info)
            links_count += 1
            if max_results == 1:
                break  # one link found is enough
        self.logger.info('found {} links using search engine id={}'.format(links_count, search_engine))
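The search_engine settings consumed above form a small dict: a mandatory 'request' query plus an optional 'max_serp_results' cap (default 10). A minimal sketch of such a step config; the query string is an assumed example, not taken from the project:

# hypothetical step config read by use_search_engine()
search_engine_config = {
    'request': 'сведения о доходах',  # assumed example query
    'max_serp_results': 1,            # accept only the first link found
}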
Code Example #2
    def click_element(self, element, link_info: TLinkInfo):
        if self.download_folder is not None:
            make_folder_empty(self.download_folder)
        assert link_info.target_url is None
        save_current_url = self.the_driver.current_url  # may differ from link_info.source_url because of redirects

        ActionChains(self.the_driver)\
            .move_to_element(element)\
            .pause(1)\
            .key_down(Keys.CONTROL) \
            .click(element)\
            .key_up(Keys.CONTROL) \
            .perform()

        time.sleep(3)

        if self.download_folder is not None:
            link_info.downloaded_file = self.wait_download_finished(180)

        if len(self.the_driver.window_handles) > 1:
            try:
                self.the_driver.set_page_load_timeout(2)
                self.the_driver.switch_to.window(self.the_driver.window_handles[-1])
                if self.the_driver.current_url != link_info.source_url and self.the_driver.current_url != 'about:blank':
                    link_info.set_target(self.the_driver.current_url, self.the_driver.title)
            except Exception:
                pass
            finally:
                self.the_driver.set_page_load_timeout(self.page_load_timeout)
        self.close_not_first_tab()
        self.the_driver.switch_to.window(self.the_driver.window_handles[0])
        if self.the_driver.current_url != save_current_url:
            self.logger.error("cannot switch to the saved url must be {}, got {}, keep going".format(
                save_current_url, self.the_driver.current_url))
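The Ctrl+click performed by the ActionChains above opens the target in a new tab, so the source page keeps its state for the next click. A minimal standalone sketch of the same pattern (plain Selenium 4; the chromedriver setup and the test URL, borrowed from code example #10, are assumptions):

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get('http://aot.ru/doc_examples/test.html')
link = driver.find_element(By.TAG_NAME, 'a')
# Ctrl+click: open the link in a new tab, keeping the current page intact
ActionChains(driver).move_to_element(link).key_down(Keys.CONTROL) \
    .click(link).key_up(Keys.CONTROL).perform()
print(driver.window_handles)  # two handles if a new tab was opened
driver.quit()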
Code Example #3
File: robot_step.py Project: TI-Russia/smart_parser
    def add_link_wrapper(self, link_info: TLinkInfo):
        assert link_info.target_url is not None
        try:
            downloaded_file = TDownloadedFile(link_info.target_url)
        except THttpRequester.RobotHttpException as err:
            self.logger.error(err)
            return

        href = link_info.target_url

        self.website.url_nodes[link_info.source_url].add_child_link(href, link_info.to_json())
        depth = self.website.url_nodes[link_info.source_url].depth + 1

        if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            html = downloaded_file.convert_html_to_utf8().lower()
            best_match_count = best_declaration_regex_match(html, from_start=False)
            if best_match_count > 0:
                add_weight = best_match_count * TLinkInfo.NORMAL_LINK_WEIGHT
                self.logger.debug("add weight {} to {} using best_declaration_regex_match".format(
                    add_weight, link_info.weight))
                link_info.weight += add_weight
        if depth < 15:
            link_info.weight -= 0.1 * depth
        elif depth < 30:
            link_info.weight -= 0.5 * depth
        else:
            link_info.weight -= 6.0 * depth

        link_info.weight = max(link_info.weight, self.url_to_weight.get(href, 0.0))
        self.url_to_weight[href] = link_info.weight

        if href not in self.website.url_nodes:
            if link_info.target_title is None and downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                link_info.target_title = get_html_title(downloaded_file.data)
            self.website.url_nodes[href] = TUrlInfo(title=link_info.target_title, step_name=self.step_name, depth=depth,
                                                    parent_node=link_info.source_url)
        else:
            self.website.url_nodes[href].add_parent_node(link_info.source_url)
            self.website.url_nodes[href].update_depth(depth)

        if self.is_last_step:
            self.website.export_env.export_file_if_relevant(downloaded_file, link_info)

        if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            if self.website.export_env.sha256_is_exported(downloaded_file.get_sha256()):
                link_info.weight = TLinkInfo.MINIMAL_LINK_WEIGHT
                self.logger.debug("set weight {} to an html declaration".format(link_info.weight))

        if self.transitive:
            if href not in self.processed_pages:
                if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                    self.pages_to_process[href] = link_info.weight

        if href in self.pages_to_process:
            self.pages_to_process[href] = max(self.pages_to_process[href], link_info.weight)

        self.logger.debug("add link {} weight={}".format(href, link_info.weight))
Code Example #4
File: robot_step.py Project: TI-Russia/smart_parser
 def looks_like_a_declaration_link(self, link_info: TLinkInfo):
     # return looks_like_a_declaration_link_without_cache(self.logger, link_info)
     if link_info.is_hashable():
         result = self.declaration_links_cache.get(link_info.hash_by_target())
         if result is not None:
             return result
     result = looks_like_a_declaration_link_without_cache(self.logger, link_info)
     if link_info.is_hashable():
         self.declaration_links_cache[link_info.hash_by_target()] = result
     return result
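This wrapper is plain memoization keyed by hash_by_target(). A generic sketch of the same shape with an explicit dict (all names here are hypothetical):

def cached_call(cache: dict, key, compute):
    # return a cached result if present, otherwise compute and remember it
    if key in cache:
        return cache[key]
    result = compute()
    cache[key] = result
    return result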
Code Example #5
File: robot_step.py Project: TI-Russia/smart_parser
    def check_anticorr_link_text(self, link_info: TLinkInfo):
        text = link_info.anchor_text.strip().lower()
        if text.find('антикоррупционная комиссия') != -1:
            link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
            return True

        if text.startswith(u'противодействие') or text.startswith(u'борьба') or text.startswith(u'нет'):
            if text.find("коррупц") != -1:
                link_info.weight = TLinkInfo.BEST_LINK_WEIGHT
                return True

        if text.find("отчеты") != -1:
            link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
            return True
        return False
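A hedged usage sketch: an anchor that starts with 'противодействие' and mentions 'коррупц' should take the second branch and get BEST_LINK_WEIGHT (the URL and the step instance are assumptions):

link_info = TLinkInfo(TClickEngine.selenium, 'http://example.com', None,
                      anchor_text='Противодействие коррупции')
assert step.check_anticorr_link_text(link_info)  # step is a hypothetical TRobotStep
assert link_info.weight == TLinkInfo.BEST_LINK_WEIGHT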
Code Example #6
File: robot_step.py Project: TI-Russia/smart_parser
 def add_downloaded_file_manually(self, downloaded_file: TDownloadedFile, href=None, declaration_year=None):
     if href is None:
         href = self.website.main_page_url
     link_info = TLinkInfo(TClickEngine.selenium, self.website.main_page_url, href,
                           source_html="", anchor_text="", tag_name="a",
                           element_index=1, downloaded_file=downloaded_file,
                           declaration_year=declaration_year)
     self.add_downloaded_file_wrapper(link_info)
Code Example #7
def gossov_tatarstan_ru(web_site: TWebSiteCrawlSnapshot):
    web_site.create_export_folder()
    robot_step = web_site.robot_steps[-1]
    driver = robot_step.get_selenium_driver()
    elements = driver.navigate_and_get_links_js("https://gossov.tatarstan.ru/structure/deputaty")
    for element in elements:
        person_href = element['href']
        if person_href is not None and person_href.find('person_id') != -1:
            robot_step.add_link_wrapper(TLinkInfo(TClickEngine.manual, robot_step.website.main_page_url, person_href))
            file = TDownloadedFile(person_href)
            parser = THtmlParser(file.data, url=person_href)
            for html_link in parser.soup.findAll("a"):
                href_pdf = html_link.attrs.get('href', '')
                if href_pdf.find('revenue') != -1:
                    href_pdf = parser.make_link_soup(href_pdf)
                    robot_step.add_link_wrapper(TLinkInfo(TClickEngine.manual, person_href, href_pdf))
Code Example #8
 def check_follow(self, src, trg, canon):
     if not src.startswith('http'):
         src = 'http://' + src
     if not trg.startswith('http'):
         trg = 'http://' + trg
     link_info = TLinkInfo(TClickEngine.selenium, src, trg)
     can_follow = self.robot_step.can_follow_this_link(link_info)
     self.assertEqual(canon, can_follow, msg="{} -> {}".format(src, trg))
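A hedged illustration of how this helper reads in a test (the URLs and expected answers are assumptions for illustration, not project fixtures):

# same-site subpages should normally be followable, foreign domains not
self.check_follow('example.com', 'example.com/anticorruption', True)
self.check_follow('example.com', 'another-site.com/page', False)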
Code Example #9
File: robot_step.py Project: TI-Russia/smart_parser
 def add_links_from_sitemap_xml(self):
     if self.sitemap_xml_processor is None:
         return
     assert self.website.main_page_url in self.website.url_nodes
     root_page = self.website.main_page_url.strip('/')
     tree = sitemap_tree_for_homepage(root_page)
     cnt = 0
     useful = 0
     for page in tree.all_pages():
         cnt += 1
         weight = self.sitemap_xml_processor(page.url)
         if weight > TLinkInfo.MINIMAL_LINK_WEIGHT:
             if page.url not in self.pages_to_process:
                 useful += 1
                 link_info = TLinkInfo(TClickEngine.sitemap_xml, self.website.main_page_url, page.url, anchor_text="")
                 link_info.weight = weight
                 self.add_link_wrapper(link_info)
     self.logger.info("processed {} links from {}/sitemap.xml found {} useful links".format(cnt, root_page, useful))
Code Example #10
 def test_download_doc(self):
     shutil.rmtree(TDownloadEnv.get_download_folder(), ignore_errors=True)
     elements = self.get_all_link_elements('http://aot.ru/doc_examples/test.html')
     url_and_elements = self.check_anchor(elements, "test.doc")
     url, element = list(url_and_elements)[0]
     link_info = TLinkInfo(TClickEngine.selenium, url, None)
     self.driver_holder.click_element(element, link_info)
     self.driver_holder.wait_download_finished()
     download_files = os.listdir(TDownloadEnv.get_download_folder())
     self.assertEqual(len(download_files), 1)
Code Example #11
def click(driver_holder, url, element_index):
    elements = driver_holder.navigate_and_get_links_js(url)
    element = elements[element_index]['id']
    print("click element {} anchor={}".format(element_index, element.text))
    link_info = TLinkInfo(TClickEngine.selenium,
                          url,
                          None,
                          anchor_text=element.text)
    driver_holder.click_element(element, link_info)
    print("href={}".format(link_info.target_url))
    print("downloaded_file={}".format(link_info.downloaded_file))
Code Example #12
File: robot_step.py Project: TI-Russia/smart_parser
    def build_link_info(self, main_url, page_html, element_index, element, html_title):
        link_text = element['anchor'].strip('\n\r\t ') if element['anchor'] is not None else ""

        return TLinkInfo(TClickEngine.selenium,
                         source_url=main_url,
                         target_url=element['href'],
                         source_html=page_html,
                         anchor_text=link_text,
                         tag_name=element['id'].tag_name,
                         element_index=element_index,
                         element_class=[element.get('class')],
                         source_page_title=html_title)
Code Example #13
File: robot_step.py Project: TI-Russia/smart_parser
    def click_selenium_if_no_href(self, main_url, element, element_index, check_link_func):
        tag_name = element.tag_name
        link_text = element.text.strip('\n\r\t ')  # initialize here, can be broken after click
        page_html = self.get_selenium_driver().the_driver.page_source
        THttpRequester.consider_request_policy(main_url + " elem_index=" + str(element_index), "click_selenium")

        link_info = TLinkInfo(TClickEngine.selenium, main_url, None,
                              source_html=page_html, anchor_text=link_text, tag_name=tag_name,
                              element_index=element_index,
                              source_page_title=self.get_selenium_driver().the_driver.title)

        self.get_selenium_driver().click_element(element, link_info)

        if self.normalize_and_check_link(link_info, check_link_func):
            if link_info.downloaded_file is not None:
                self.add_downloaded_file_wrapper(link_info)
            elif link_info.target_url is not None:
                self.add_link_wrapper(link_info)
Code Example #14
File: test_mid.py Project: TI-Russia/smart_parser
    def test_mid_video(self):
        THttpRequester.ENABLE_HEAD_REQUESTS = True
        link_info = TLinkInfo(TClickEngine.selenium,
            source_url='https://www.mid.ru/ru/brifingi/-/asset_publisher/MCZ7HQuMdqBY/content/id/4781270#12',
            target_url='https://www.mid.ru/documents/10180/4780294/210610%281%29.mp4/8acd221f-cb28-4522-a251-5437b160672e'
        )
        logger = self.logger

        class TDummyProject:
            def __init__(self):
                self.config = TRobotConfig.read_by_config_type("prod")

        class TDummyOffice:
            def __init__(self):
                self.logger = logger
                self.parent_project = TDummyProject()

        step_info = TRobotStep(TDummyOffice())
        res = step_info.normalize_and_check_link(link_info, TRobotStep.looks_like_a_declaration_link)
        self.assertFalse(res)
Code Example #15
 def filter_link_elements_by_anchor(self, link_elements, start_anchor_text):
     urls_and_elements = set()
     for element_index, element in enumerate(link_elements):
         try:
             if element['anchor'] is None:
                 continue
             link_text = element['anchor'].strip('\n\r\t ')
             self.logger.debug("check link anchor={}, element_index={}".format(link_text, element_index))
             if link_text.lower().startswith(start_anchor_text.lower()):
                 self.logger.debug("found link anchor={}".format(link_text))
                 href = element['href']
                 if href is None:
                     link_info = TLinkInfo(TClickEngine.selenium, self.driver_holder.the_driver.current_url, None)
                     self.driver_holder.click_element(element['id'], link_info)
                     href = self.driver_holder.the_driver.current_url
                     self.driver_holder.the_driver.back()
                 urls_and_elements.add((href, element['id']))
         except Exception as exp:
             self.logger.error(exp)
     return urls_and_elements
Code Example #16
def looks_like_a_declaration_link_without_cache(logger, link_info: TLinkInfo):
    # here is a place for ML
    anchor_text_russified = normalize_and_russify_anchor_text(link_info.anchor_text)
    page_html = normalize_and_russify_anchor_text(link_info.page_html)
    positive_case = None
    anchor_best_match = False
    if best_declaration_regex_match(anchor_text_russified):
        anchor_best_match = True
        positive_case = "case 0"
    elif has_negative_words(anchor_text_russified):
        return False

    if link_info.target_url is not None:
        # we make an HTTP HEAD request here, which is rather slow
        file_extension = get_file_extension_only_by_headers(
            link_info.target_url)
        if is_video_or_audio_file_extension(file_extension):
            logger.debug("link {} looks like a media file, skipped".format(
                link_info.target_url))
            return False

    income_regexp = '(доход((ах)|(е)))|(коррупц)'
    sved_regexp = '(сведения)|(справк[аи])|(sveden)'
    svedenija_anchor = re.search(sved_regexp, anchor_text_russified) is not None or \
                       re.search(sved_regexp, link_info.anchor_text, re.IGNORECASE) is not None
    year_anchor = re.search('\\b20[0-9][0-9]\\b',
                            anchor_text_russified) is not None
    income_page = re.search(income_regexp, page_html) is not None
    source_page_title_has_income_word = re.search(
        income_regexp, link_info.source_page_title) is not None
    income_anchor = re.search(income_regexp, anchor_text_russified) is not None
    role_anchor = is_public_servant_role(anchor_text_russified)
    office_word = has_office_word_in_beginning(anchor_text_russified)
    geo_leaf_word = has_geo_leaf_word_in_beginning(anchor_text_russified)
    document_url = None
    sub_page = check_sub_page_or_iframe(logger, link_info)
    income_url, svedenija_url, corrupt_url = url_features(link_info.target_url)
    if link_info.element_class is not None:
        if isinstance(link_info.element_class, list):
            for css_class_name in link_info.element_class:
                if re.search(INCOME_URL_REGEXP, css_class_name, re.IGNORECASE):
                    income_url = True

    if positive_case is None:
        if income_page or income_url:
            if svedenija_anchor or year_anchor or sub_page:
                positive_case = "case 1"
            else:
                if document_url is None:
                    # lazy calculation: this makes a time-consuming HTTP HEAD request
                    document_url = looks_like_a_document_link(logger, link_info)
                if document_url:
                    positive_case = "case 1"

    # http://arshush.ru/index.php?option=com_content&task=blogcategory&id=62&Itemid=72
    # "Сведения за 2018 год" - no topic word
    if positive_case is None:
        if svedenija_anchor or svedenija_url:
            if year_anchor:
                positive_case = "case 2"
            else:
                if document_url is None:
                    document_url = looks_like_a_document_link(
                        logger, link_info)
                if document_url:
                    positive_case = "case 2"

    if positive_case is None:
        if (income_page or income_url) and role_anchor:
            positive_case = "case 3"

    if positive_case is None:
        if source_page_title_has_income_word and income_url:
            positive_case = "case 4"

    if positive_case is None:
        if office_word:
            positive_case = "case 5"

    if positive_case is None:
        if geo_leaf_word:
            positive_case = "case 6"

    if positive_case is None:
        # very special case for sudrf.ru (cannot use the domain name here because of unittests)
        # maybe it should be revised
        # http://oblsud.ros.sudrf.ru/modules.php?name=anticorruption&rid=6
        if link_info.target_url is not None  and link_info.target_url.find('name=anticorruption') != -1 and \
            anchor_text_russified is not None and anchor_text_russified.lower().strip().endswith('архив'):
            positive_case = "case 7"
            anchor_best_match = True

    if positive_case is not None:
        weight = TLinkInfo.MINIMAL_LINK_WEIGHT
        if anchor_best_match:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_anchor:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if income_url:
            weight += TLinkInfo.BEST_LINK_WEIGHT
        if svedenija_anchor:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if svedenija_url:
            weight += TLinkInfo.NORMAL_LINK_WEIGHT
        if year_anchor:
            weight += TLinkInfo.TRASH_LINK_WEIGHT  # better than sub_page
        if income_page and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if corrupt_url and weight > 0:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if office_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING
        if geo_leaf_word:
            weight += TLinkInfo.LINK_WEIGHT_FOR_INCREMENTING

        all_features = (("income_page", income_page), ("income_url",
                                                       income_url),
                        ('income_anchor', income_anchor), ('svedenija_anchor',
                                                           svedenija_anchor),
                        ('svedenija_url', svedenija_url), ("document_url",
                                                           document_url),
                        ("sub_page", sub_page), ("year_anchor", year_anchor),
                        ("corrupt_url", corrupt_url), ('role_anchor',
                                                       role_anchor),
                        ('anchor_best_match',
                         anchor_best_match), ('office_word',
                                              office_word), ('geo_leaf_word',
                                                             geo_leaf_word))

        all_features_str = ";".join(k for k, v in all_features if v)
        logger.debug("{}, weight={}, features: {}".format(
            positive_case, weight, all_features_str))
        link_info.weight = weight
        return True
    return False
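The anchor regexps above can be sanity-checked in isolation; a small sketch with an assumed sample anchor:

import re

income_regexp = '(доход((ах)|(е)))|(коррупц)'
sved_regexp = '(сведения)|(справк[аи])|(sveden)'
assert re.search(income_regexp, 'сведения о доходах за 2018 год')
assert re.search(sved_regexp, 'сведения о доходах за 2018 год')
assert re.search('\\b20[0-9][0-9]\\b', 'сведения о доходах за 2018 год')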
Code Example #17
File: robot_step.py Project: TI-Russia/smart_parser
 def add_regional_main_pages(self):
     for url in self.website.get_regional_pages():
         link_info = TLinkInfo(TClickEngine.manual, self.website.main_page_url, url)
         link_info.weight = TLinkInfo.NORMAL_LINK_WEIGHT
         self.add_link_wrapper(link_info)