def get_url_list(self, start_selenium=False):
    web_domains = list()
    if self.args.filter_by_source is not None:
        for k in self.web_sites.web_sites.values():
            if k.parent_office.source_id == self.args.filter_by_source:
                web_domains.append(get_site_url(k.url))
    elif self.args.url_list is not None:
        web_domains = self.read_web_domains_from_file()
    else:
        # take all web domains
        web_domains = list(self.web_sites.web_sites.keys())

    domains_filtered = list(w for w in web_domains if self.check_web_site_filters(w))
    self.logger.info("we are going to process {} web sites".format(len(domains_filtered)))

    if start_selenium:
        TDownloadEnv.FILE_CACHE_FOLDER = TDownloadEnv.FILE_CACHE_FOLDER + "_{}_{}".format(
            time.time(), os.getpid())
        self.logger.info("rm {}".format(TDownloadEnv.FILE_CACHE_FOLDER))
        TDownloadEnv.clear_cache_folder()
        project_path = "project.txt"
        TRobotProject.create_project("dummy.ru", project_path)
        with TRobotProject(self.logger, project_path, export_folder="result") as self.temp_dlrobot_project:
            for w in domains_filtered:
                yield w
        os.unlink(project_path)
    else:
        for w in domains_filtered:
            yield w
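# A minimal usage sketch (the caller below is assumed, not taken from the original sources):
# get_url_list() is a generator, so domains can be consumed one by one while the temporary
# selenium project created for start_selenium=True is still open.
#
#   for domain in self.get_url_list(start_selenium=True):
#       self.process_one_web_site(domain)  # hypothetical per-domain processing method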
def create_departments(self):
    o: TOfficeInMemory
    TDownloadEnv.clear_cache_folder()
    project_path = "project.txt"
    TRobotProject.create_project("dummy.ru", project_path, web_sites_db=self.web_sites)
    with TRobotProject(self.logger, project_path, export_folder="result") as self.temp_dlrobot_project:
        for o in self.web_sites.offices.values():
            if o.parent_id == self.args.parent_office_id:
                self.logger.info("office id = {}, {}".format(o.office_id, o.name))
                query = self.args.query_template.format(o.name)
                engine = random.choice([SearchEngineEnum.GOOGLE, SearchEngineEnum.YANDEX])
                results = SearchEngine.send_request(engine, query, self.temp_dlrobot_project.selenium_driver)
                if len(results) == 0:
                    msg = "cannot find results for query {}".format(query)
                    self.logger.error(msg)
                else:
                    new_web_site = TDeclarationWebSite(url=results[0])
                    found = False
                    for u in o.office_web_sites:
                        if u.url == new_web_site.url:
                            found = True
                            self.logger.error("{} already exists".format(new_web_site.url))
                    if not found:
                        o.office_web_sites.append(new_web_site)
                        self.check_alive_one_url(new_web_site.url)
                # throttle search engine requests
                time.sleep(20)
def setUp(self): self.env = TestDlrobotEnv("data.ssl") TDownloadEnv.clear_cache_folder() THttpRequester.ENABLE = False logger = setup_logging(log_file_name="dlrobot.log") THttpRequester.initialize(logger)
def setUp(self):
    self.server_address = '127.0.0.1:{}'.format(self.web_site_port)
    self.web_server = TestHTTPServer(self.web_site_port)
    threading.Thread(target=start_server, args=(self.web_server,)).start()
    time.sleep(1)
    self.env = TestDlrobotEnv("data.timeout")
    TDownloadEnv.clear_cache_folder()
    self.logger = setup_logging(log_file_name="dlrobot.log")
    THttpRequester.initialize(self.logger)
def test_download_doc(self):
    shutil.rmtree(TDownloadEnv.get_download_folder(), ignore_errors=True)
    elements = self.get_all_link_elements('http://aot.ru/doc_examples/test.html')
    url_and_elements = self.check_anchor(elements, "test.doc")
    url, element = list(url_and_elements)[0]
    link_info = TLinkInfo(TClickEngine.selenium, url, None)
    self.driver_holder.click_element(element, link_info)
    self.driver_holder.wait_download_finished()
    download_files = os.listdir(TDownloadEnv.get_download_folder())
    self.assertEqual(len(download_files), 1)
def __init__(self, args):
    self.args = args
    self.logger = setup_logging(log_file_name=args.logfile, logger_name="dlr")
    self.config = TRobotConfig.read_by_config_type(self.args.config_type)
    self.config.update_from_program_args(self.args)
    self.logger.debug("crawling_timeout={}".format(self.config.crawling_timeout))
    TDownloadEnv.init_conversion(self.logger)
    THttpRequester.initialize(self.logger)
    if args.clear_cache_folder:
        TDownloadEnv.clear_cache_folder()
def collect_links_selenium(self, start_url, link_func=TRobotStep.looks_like_a_declaration_link,
                           is_last_step=False):
    TDownloadEnv.clear_cache_folder()
    robot_steps = [{'step_name': "declarations"}]
    with TRobotProject(self.logger, "project.txt", TRobotConfig(passport_steps=robot_steps), "result",
                       enable_search_engine=False) as project:
        project.read_project()
        office_info = project.web_site_snapshots[0]
        office_info.create_export_folder()

        step_info = TRobotStep(office_info, step_name="declarations", is_last_step=is_last_step)
        if isinstance(start_url, list):
            for x in start_url:
                step_info.pages_to_process[x] = 0
                office_info.url_nodes[x] = TUrlInfo(title="", step_name=None)
        else:
            office_info.url_nodes[start_url] = TUrlInfo(title="", step_name=None)
            step_info.pages_to_process[start_url] = 0
        step_info.processed_pages = set()
        step_info.apply_function_to_links(link_func)

        links = dict()
        for url, weight in step_info.url_to_weight.items():
            # normalize the host to "dummy" so that only the path part of each link is compared
            u = list(urllib.parse.urlparse(url))
            u[1] = "dummy"
            links[urllib.parse.urlunparse(u)] = weight
        for url_info in office_info.url_nodes.values():
            for d in url_info.downloaded_files:
                links[d.downloaded_file] = 1
        return links
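# A minimal usage sketch (assumed test code, not taken from the original sources):
# the returned dict maps normalized links (host replaced by "dummy") and downloaded
# file paths to their weights.
#
#   links = self.collect_links_selenium("http://127.0.0.1:8000/index.html")
#   self.assertIn("http://dummy/declarations/index.html", links)  # hypothetical expected link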
def get_cached_file_name(site_url, query):
    filename = unidecode(site_url + " " + query)
    filename = re.sub('[ :"\\/]', "_", filename)
    return os.path.join(TDownloadEnv.get_search_engine_cache_folder(), filename)
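# A minimal usage sketch (the values below are hypothetical, not taken from the original sources):
# spaces, colons, quotes and slashes are replaced with "_" before the name is joined with the
# search engine cache folder.
#
#   get_cached_file_name("http://example.org", "sample query")
#   # -> <search_engine_cache_folder>/http___example.org_sample_query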