Example 1
    def get_url_list(self, start_selenium=False):
        web_domains = list()
        if self.args.filter_by_source is not None:
            for k in self.web_sites.web_sites.values():
                if k.parent_office.source_id == self.args.filter_by_source:
                    web_domains.append(get_site_url(k.url))
        elif self.args.url_list is not None:
            web_domains = self.read_web_domains_from_file()
        else:
            # take all web domains
            web_domains = list(self.web_sites.web_sites.keys())

        domains_filtered = [w for w in web_domains
                            if self.check_web_site_filters(w)]

        self.logger.info("we are going to process {} web sites".format(
            len(domains_filtered)))

        if start_selenium:
            TDownloadEnv.FILE_CACHE_FOLDER = TDownloadEnv.FILE_CACHE_FOLDER + "_{}_{}".format(
                time.time(), os.getpid())
            self.logger.info("rm {}".format(TDownloadEnv.FILE_CACHE_FOLDER))
            TDownloadEnv.clear_cache_folder()
            project_path = "project.txt"
            TRobotProject.create_project("dummy.ru", project_path)
            with TRobotProject(
                    self.logger, project_path,
                    export_folder="result") as self.temp_dlrobot_project:
                for w in domains_filtered:
                    yield w
                os.unlink(project_path)
        else:
            for w in domains_filtered:
                yield w
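The example filters the domain list up front, logs the count, then yields lazily; for the Selenium path it also makes the file cache unique per run by appending the current time and PID. A minimal standard-library sketch of the same pattern (the name iter_filtered and the sample domains are made up for illustration):

import os
import time

def iter_filtered(items, predicate):
    # Filter first so the count can be logged, then yield lazily,
    # mirroring the generator structure of get_url_list above.
    filtered = [x for x in items if predicate(x)]
    print("we are going to process {} web sites".format(len(filtered)))
    for x in filtered:
        yield x

# A unique per-run cache folder, like the time()/getpid() suffix above.
cache_folder = "cache_{}_{}".format(time.time(), os.getpid())
for domain in iter_filtered(["a.ru", "b.org"], lambda d: d.endswith(".ru")):
    print(domain)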
Example 2
 def create_departments(self):
     o: TOfficeInMemory
     TDownloadEnv.clear_cache_folder()
     project_path = "project.txt"
     TRobotProject.create_project("dummy.ru",
                                  project_path,
                                  web_sites_db=self.web_sites)
     with TRobotProject(
             self.logger, project_path,
             export_folder="result") as self.temp_dlrobot_project:
         for o in self.web_sites.offices.values():
             if o.parent_id == self.args.parent_office_id:
                 self.logger.info("office id = {}, {}".format(
                     o.office_id, o.name))
                 query = self.args.query_template.format(o.name)
                 engine = random.choice(
                     [SearchEngineEnum.GOOGLE, SearchEngineEnum.YANDEX])
                 results = SearchEngine.send_request(
                     engine, query,
                     self.temp_dlrobot_project.selenium_driver)
                 if len(results) == 0:
                     msg = "cannot find results for query {}".format(query)
                     self.logger.error(msg)
                 else:
                     new_web_site = TDeclarationWebSite(url=results[0])
                     found = False
                     for u in o.office_web_sites:
                         if u.url == new_web_site.url:
                             found = True
                             self.logger.error(
                                 "{} already exists".format(new_web_site.url))
                     if not found:
                         o.office_web_sites.append(new_web_site)
                         self.check_alive_one_url(new_web_site.url)
                 time.sleep(20)
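The loop takes the first search result per office, deduplicates by URL before appending, and sleeps between requests to rate-limit the search engine. A small sketch of the dedup-then-append step, using plain URL strings (add_site_if_new and the sample URLs are hypothetical):

import time

def add_site_if_new(known_urls, new_url):
    # Compare plain URL strings before appending, which is the check
    # the loop above performs (u.url == new_web_site.url).
    if new_url in known_urls:
        return False
    known_urls.append(new_url)
    return True

sites = ["http://old.example.ru"]
for candidate in ["http://old.example.ru", "http://new.example.ru"]:
    if add_site_if_new(sites, candidate):
        time.sleep(1)  # pause between external requests, as above
print(sites)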
Example 3
    def setUp(self):
        self.env = TestDlrobotEnv("data.ssl")

        TDownloadEnv.clear_cache_folder()
        THttpRequester.ENABLE = False
        logger = setup_logging(log_file_name="dlrobot.log")
        THttpRequester.initialize(logger)
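The fixture builds per-test state in setUp: a clean cache, a fresh logger, and outbound HTTP disabled via THttpRequester.ENABLE. A minimal sketch of the same fixture pattern (SetUpExample and http_enabled are stand-ins, not part of the project):

import logging
import unittest

class SetUpExample(unittest.TestCase):
    # Each test starts from a fresh logger and with HTTP disabled,
    # mirroring the setUp above.
    def setUp(self):
        self.logger = logging.getLogger("dlrobot-test")
        self.http_enabled = False  # stands in for THttpRequester.ENABLE

    def test_http_disabled(self):
        self.assertFalse(self.http_enabled)

if __name__ == "__main__":
    unittest.main()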
Example 4
 def setUp(self):
     self.server_address = '127.0.0.1:{}'.format(self.web_site_port)
     self.web_server = TestHTTPServer(self.web_site_port)
     threading.Thread(target=start_server, args=(self.web_server,)).start()
     time.sleep(1)
     self.env = TestDlrobotEnv("data.timeout")
     TDownloadEnv.clear_cache_folder()
     self.logger = setup_logging(log_file_name="dlrobot.log")
     THttpRequester.initialize(self.logger)
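Here the test web server runs in a background thread, with a sleep(1) to let it come up before the test proceeds. A standard-library analogue of that pattern; binding port 0 lets the OS pick a free port, which avoids both the fixed web_site_port and the warm-up sleep:

import http.server
import threading

# Background test server, as in the setUp above, but using the stdlib
# instead of the project's TestHTTPServer.
server = http.server.HTTPServer(("127.0.0.1", 0),
                                http.server.SimpleHTTPRequestHandler)
threading.Thread(target=server.serve_forever, daemon=True).start()
print("serving on 127.0.0.1:{}".format(server.server_port))
server.shutdown()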
Example 5
 def test_download_doc(self):
     shutil.rmtree(TDownloadEnv.get_download_folder(), ignore_errors=True)
     elements = self.get_all_link_elements('http://aot.ru/doc_examples/test.html')
     url_and_elements = self.check_anchor(elements, "test.doc")
     url, element = list(url_and_elements)[0]
     link_info = TLinkInfo(TClickEngine.selenium, url, None)
     self.driver_holder.click_element(element, link_info)
     self.driver_holder.wait_download_finished()
     download_files = os.listdir(TDownloadEnv.get_download_folder())
     self.assertEqual(len(download_files), 1)
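The original line read assertTrue(len(download_files), 1), which treats 1 as the failure message and passes for any non-empty list; assertEqual is the check the test intends. A self-contained illustration (DownloadCountTest is hypothetical):

import unittest

class DownloadCountTest(unittest.TestCase):
    def test_exactly_one_file(self):
        files = ["test.doc"]
        # assertTrue(len(files), 1) would pass for 2, 3, ... files too;
        # assertEqual asserts the exact count.
        self.assertEqual(len(files), 1)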
Example 6
 def __init__(self, args):
     self.args = args
     self.logger = setup_logging(log_file_name=args.logfile,
                                 logger_name="dlr")
     self.config = TRobotConfig.read_by_config_type(self.args.config_type)
     self.config.update_from_program_args(self.args)
     self.logger.debug("crawling_timeout={}".format(
         self.config.crawling_timeout))
     TDownloadEnv.init_conversion(self.logger)
     THttpRequester.initialize(self.logger)
     if args.clear_cache_folder:
         TDownloadEnv.clear_cache_folder()
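The constructor wires everything from the parsed arguments: logging, config, converters, and a cache wipe only when explicitly requested. A sketch of that argument-driven setup with argparse (build_runner and the flag spelling are assumptions for illustration):

import argparse
import logging

def build_runner(argv):
    # Parse args, set up logging, and clear the cache only when the
    # flag is given, as in the __init__ above.
    parser = argparse.ArgumentParser()
    parser.add_argument("--clear-cache-folder", action="store_true")
    parser.add_argument("--logfile", default="dlrobot.log")
    args = parser.parse_args(argv)
    logger = logging.getLogger("dlr")
    if args.clear_cache_folder:
        logger.info("clearing cache folder")
    return args, logger

args, logger = build_runner(["--clear-cache-folder"])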
Example 7
    def collect_links_selenium(
            self,
            start_url,
            link_func=TRobotStep.looks_like_a_declaration_link,
            is_last_step=False):

        TDownloadEnv.clear_cache_folder()
        robot_steps = [{'step_name': "declarations"}]
        with TRobotProject(
                self.logger,
                "project.txt",
                TRobotConfig(passport_steps=robot_steps),
                "result",
                enable_search_engine=False,
        ) as project:
            project.read_project()
            office_info = project.web_site_snapshots[0]
            office_info.create_export_folder()

            step_info = TRobotStep(office_info,
                                   step_name="declarations",
                                   is_last_step=is_last_step)
            if isinstance(start_url, list):
                for x in start_url:
                    step_info.pages_to_process[x] = 0
                    office_info.url_nodes[x] = TUrlInfo(title="",
                                                        step_name=None)
            else:
                office_info.url_nodes[start_url] = TUrlInfo(title="",
                                                            step_name=None)
                step_info.pages_to_process[start_url] = 0

            step_info.processed_pages = set()
            step_info.apply_function_to_links(link_func)
            links = dict()
            for url, weight in step_info.url_to_weight.items():
                u = list(urllib.parse.urlparse(url))
                u[1] = "dummy"
                links[urllib.parse.urlunparse(u)] = weight

            for url_info in office_info.url_nodes.values():
                for d in url_info.downloaded_files:
                    links[d.downloaded_file] = 1
            return links
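Before returning, the collected URLs are normalized by swapping the netloc for the placeholder "dummy", so links compare by path and query regardless of host. That trick in isolation (normalize_netloc is a made-up name):

import urllib.parse

def normalize_netloc(url, placeholder="dummy"):
    # Replace the host component, as the loop over url_to_weight does
    # above, so links from different hosts compare equal.
    parts = list(urllib.parse.urlparse(url))
    parts[1] = placeholder
    return urllib.parse.urlunparse(parts)

print(normalize_netloc("http://example.ru/declarations?page=2"))
# http://dummy/declarations?page=2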
Example 8
 def get_cached_file_name(site_url, query):
     filename = unidecode(site_url + " " + query)
     filename = re.sub(r'[ :"\\/]', "_", filename)
     return os.path.join(TDownloadEnv.get_search_engine_cache_folder(),
                         filename)
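The function derives a cache file name from the site URL and query: transliterate with unidecode, then replace filesystem-hostile characters with underscores. A sketch of the same scheme without the unidecode step (cache_file_name is a hypothetical name; the raw-string pattern covers spaces, colons, double quotes, backslashes and slashes):

import os
import re

def cache_file_name(cache_folder, site_url, query):
    # Build a cache key from url + query and sanitize it for use as a
    # file name, as get_cached_file_name does above.
    filename = re.sub(r'[ :"\\/]', "_", site_url + " " + query)
    return os.path.join(cache_folder, filename)

print(cache_file_name("cache", "example.ru", 'site:example.ru "declaration"'))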