Ejemplo n.º 1
0
    def site_search(search_engine,
                    site_url,
                    query,
                    selenium_holder: TSeleniumDriver,
                    enable_cache=True):
        """Run a site-restricted query ("site:<site_url> <query>") through the
        given search engine and return the result urls that belong to site_url.

        :param search_engine: search engine id passed to SearchEngine.send_request
        :param site_url: site to restrict the search to (e.g. "https://www.mos.ru/dgi/")
        :param query: free-text query
        :param selenium_holder: selenium driver wrapper (provides .the_driver and .logger)
        :param enable_cache: if True, return cached urls when available and
            write filtered results back to the cache
        :raises SerpException: when the engine returned no results and the page
            shows none of the known "nothing found" markers (likely a captcha)
        """
        if enable_cache:
            cached_results = SearchEngine.read_cache(site_url, query)
            if len(cached_results) > 0:
                return cached_results['urls']

        # shuffle the two request parts so repeated requests look less bot-like
        request_parts = ["site:{}".format(site_url), query]
        random.shuffle(request_parts)
        search_engine_request = " ".join(request_parts)

        if SearchEngine.is_search_engine_ref(
                query) or SearchEngine.is_search_engine_ref(site_url):
            selenium_holder.logger.error(
                "Warning! we use keyword 'google' to filter results out, search would yield no results"
            )

        search_results = SearchEngine.send_request(search_engine,
                                                   search_engine_request,
                                                   selenium_holder)

        if len(search_results) == 0:
            html = selenium_holder.the_driver.page_source
            # BUG FIX: the original chained these checks with "or", which is
            # true unless ALL three "nothing found" markers are present at once
            # (never happens), so a legitimate empty-result page also raised.
            # Raise only when NONE of the markers is present, i.e. the empty
            # result is unexplained and the page is probably a captcha.
            if html.find("ничего не нашлось") == -1 and html.find("ничего не найдено") == -1 \
                and html.find('did not match any documents') == -1:
                raise SerpException(
                    "no search results, look in debug_captcha.html, may be captcha"
                )

        # keep only urls that really belong to the requested site, preserving
        # order and dropping duplicates; https://www.mos.ru/dgi/ -> mos.ru/dgi
        site_search_results = list()
        web_site = strip_scheme_and_query(site_url)
        for url in search_results:
            if strip_scheme_and_query(url).startswith(web_site):
                if url not in site_search_results:
                    site_search_results.append(url)

        if enable_cache:
            if len(search_results) > 0:
                SearchEngine._write_cache(selenium_holder.logger, site_url,
                                          query, site_search_results)

        return site_search_results
Ejemplo n.º 2
0
    def check_alive_one_url(self, site_url, complete_bans, site_info=None):
        """Probe site_url and update its web-site record in place.

        Outcomes:
        - site unreachable: ban the record and append site_url to complete_bans;
        - site redirected elsewhere: mark the redirect on the old record and
          find or create a record for the new url in the same office;
        - site alive at the same url: refresh the stored page title.
        Additionally flags pages containing a corruption-related keyword and,
        when self.args.main_page_path is set, saves the page html to disk.

        :param site_url: url to check
        :param complete_bans: list collecting urls found dead (mutated here)
        :param site_info: TDeclarationWebSite record for site_url; looked up
            via self.web_sites when None
        """
        site_info: TDeclarationWebSite
        if site_info is None:
            site_info = self.web_sites.get_web_site(site_url)
        web_site = self.browse_one_url(site_url)
        office = site_info.parent_office
        if web_site is None:
            self.logger.info("     {} is dead".format(site_url))
            site_info.ban()
            complete_bans.append(site_url)
        else:
            # canonical form of the url the browser actually ended up on
            new_site_url = web_site.get_main_url_protocol(
            ) + "://" + strip_scheme_and_query(web_site.main_page_url)
            title = web_site.get_title(web_site.main_page_url)
            # compare ignoring scheme/query and trailing slash to detect a redirect
            if strip_scheme_and_query(
                    web_site.main_page_url).strip('/') != site_url.strip('/'):
                self.logger.info(
                    '   {} is alive, but is redirected to {}'.format(
                        site_url, new_site_url))
                # record the redirect on the old entry and locate (or create)
                # the entry for the redirect target within the same office
                new_site_info = None
                for u in office.office_web_sites:
                    if u.url == site_url:
                        u.set_redirect(new_site_url)
                    if u.url == new_site_url:
                        new_site_info = u
                if new_site_info is None:
                    new_site_info = TDeclarationWebSite(url=new_site_url)
                    office.office_web_sites.append(new_site_info)
                new_site_info.set_title(title)
            else:
                self.logger.info("     {} is alive, main_page_url = {}".format(
                    site_url, web_site.main_page_url))
                site_info.set_title(title)

            # 'коррупц' is the stem of the Russian word for "corruption"
            if web_site.main_page_source.lower().find('коррупц') != -1:
                self.logger.info(
                    "site contains corruption keyword {}".format(site_url))
                site_info.corruption_keyword_in_html = True

            if self.args.main_page_path:
                # best effort: failing to save the html must not abort the check
                try:
                    with open(
                            self.get_external_file_name_by_site_url(site_url),
                            "w") as outp:
                        outp.write(web_site.main_page_source)
                except Exception as exp:
                    self.logger.error(
                        "cannot save page html to file: {} ".format(site_url))
Ejemplo n.º 3
0
 def read_web_domains_from_file(self):
     """Read urls from the file self.args.url_list, one per line.

     Lines that start with 'http' are reduced to their scheme-less,
     query-less form via strip_scheme_and_query; other lines are kept
     verbatim. Returns the list in file order.
     """
     self.logger.info("read url list from {}".format(self.args.url_list))
     domains = list()
     with open(self.args.url_list) as url_file:
         for line in url_file:
             candidate = line.strip(" \r\n")
             if candidate.startswith('http'):
                 candidate = strip_scheme_and_query(candidate)
             domains.append(candidate)
     return domains
Ejemplo n.º 4
0
    def select_adhoc(self):
        """Ban spravochnik web sites whose domain is absent from the url list.

        Reads the allow-list via self.read_web_domains_from_file(); every
        communicating site of a spravochnik office whose stripped url (with or
        without a trailing slash) is not in that list gets banned as
        unpromising. Logs the ban/keep counts at the end.
        """
        keep_domains = set(self.read_web_domains_from_file())
        banned_count = 0
        kept_count = 0
        for office in self.web_sites.offices.offices.values():  # TOfficeInMemory
            if not office.is_from_spravochnik():
                continue
            for site in office.office_web_sites:  # TDeclarationWebSite
                if not site.can_communicate():
                    continue
                domain = strip_scheme_and_query(site.url)
                # the allow-list may store the domain with a trailing slash
                if domain in keep_domains or "{}/".format(
                        domain) in keep_domains:
                    kept_count += 1
                    continue
                banned_count += 1
                self.logger.debug("ban office_id={}".format(
                    office.office_id))
                site.ban(TWebSiteReachStatus.unpromising)
        self.logger.info("ban {} sites, left in spravochnik {}".format(
            banned_count, kept_count))
Ejemplo n.º 5
0
 def test_url_strip(self):
     """strip_scheme_and_query drops the scheme, a leading 'www.' prefix and
     decodes punycode (IDNA) host names to unicode."""
     self.assertEqual(strip_scheme_and_query('https://aot.ru/test'), 'aot.ru/test')
     self.assertEqual(strip_scheme_and_query('https://www.aot.ru/test'), 'aot.ru/test')
     # scheme-less input: only the 'www.' prefix is removed
     # (the original had this exact assertion duplicated twice — copy-paste slip)
     self.assertEqual(strip_scheme_and_query('www.aot.ru/test'), 'aot.ru/test')
     # punycode domain is decoded to its unicode form
     self.assertEqual(strip_scheme_and_query('https://xn--80agabx3af.xn--p1ai/'), 'дагогни.рф')
Ejemplo n.º 6
0
 def get_site_url(self):
     """Return the site identifier derived from the main page url.

     1. In many cases this returns just the web domain; sometimes it returns
        the web domain plus a url path, like "mos.ru/dpi".
     2. The result can differ from self.input_site_url if a new
        http-redirection has occurred.
     """
     return strip_scheme_and_query(self.main_page_url)