def get_url_list(self, start_selenium=False):
    web_domains = list()
    if self.args.filter_by_source is not None:
        for k in self.web_sites.web_sites.values():
            if k.parent_office.source_id == self.args.filter_by_source:
                web_domains.append(get_site_url(k.url))
    elif self.args.url_list is not None:
        web_domains = self.read_web_domains_from_file()
    else:
        # take all web domains
        web_domains = list(self.web_sites.web_sites.keys())

    domains_filtered = list(w for w in web_domains if self.check_web_site_filters(w))
    self.logger.info("we are going to process {} web sites".format(len(domains_filtered)))

    if start_selenium:
        # use a unique cache folder for this run, then recreate it
        TDownloadEnv.FILE_CACHE_FOLDER = TDownloadEnv.FILE_CACHE_FOLDER + "_{}_{}".format(time.time(), os.getpid())
        self.logger.info("rm {}".format(TDownloadEnv.FILE_CACHE_FOLDER))
        TDownloadEnv.clear_cache_folder()
        project_path = "project.txt"
        TRobotProject.create_project("dummy.ru", project_path)
        with TRobotProject(self.logger, project_path, export_folder="result") as self.temp_dlrobot_project:
            for w in domains_filtered:
                yield w
        os.unlink(project_path)
    else:
        for w in domains_filtered:
            yield w

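# Usage sketch (illustrative, not from the original source): get_url_list is a generator,
# so a caller iterates over it lazily; pairing it with check_alive_one_url (referenced in
# create_departments below) is an assumption about how this class is used.
#
#   for site_url in self.get_url_list(start_selenium=True):
#       self.check_alive_one_url(site_url)
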
def create_departments(self):
    o: TOfficeInMemory
    TDownloadEnv.clear_cache_folder()
    project_path = "project.txt"
    TRobotProject.create_project("dummy.ru", project_path, web_sites_db=self.web_sites)
    with TRobotProject(self.logger, project_path, export_folder="result") as self.temp_dlrobot_project:
        for o in self.web_sites.offices.values():
            if o.parent_id == self.args.parent_office_id:
                self.logger.info("office id = {}, {}".format(o.office_id, o.name))
                query = self.args.query_template.format(o.name)
                engine = random.choice([SearchEngineEnum.GOOGLE, SearchEngineEnum.YANDEX])
                results = SearchEngine.send_request(engine, query, self.temp_dlrobot_project.selenium_driver)
                if len(results) == 0:
                    msg = "cannot find results for query {}".format(query)
                    self.logger.error(msg)
                else:
                    new_web_site = TDeclarationWebSite(url=results[0])
                    found = False
                    for u in o.office_web_sites:
                        if u.url == new_web_site.url:
                            found = True
                            self.logger.error("{} already exists".format(new_web_site.url))
                    if not found:
                        o.office_web_sites.append(new_web_site)
                        self.check_alive_one_url(new_web_site.url)
                # throttle requests to the search engine
                time.sleep(20)

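# Usage note (illustrative): args.query_template is expected to contain a single "{}"
# placeholder that is filled with the office name, e.g. a template like "{} official site";
# the first search result is then wrapped into a TDeclarationWebSite candidate.
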
def test_unknown_site(self):
    self.project_path = os.path.join(self.env.data_folder, "project.txt")
    TRobotProject.create_project("http://unknown_site.org", self.project_path)
    dlrobot = TDlrobot(TDlrobot.parse_args(['--clear-cache-folder', '--project', self.project_path]))
    try:
        project = dlrobot.open_project()
    except THttpRequester.RobotHttpException:
        pass
    self.assertEqual(project.web_site_snapshots[0].reach_status, TWebSiteReachStatus.abandoned)
    TDownloadEnv.CONVERSION_CLIENT.stop_conversion_thread()
    TDownloadEnv.CONVERSION_CLIENT = None

def copy_files(args, toloka_results):
    assert args.positive_folder is not None
    assert args.negative_folder is not None
    logger = logging.getLogger("")
    with TRobotProject(logger, args.project) as project:
        project.read_project()
        office_info: TWebSiteCrawlSnapshot
        office_info = project.web_site_snapshots[0]
        index = 0
        site_url = office_info.get_site_url()
        for export_record in office_info.exported_files:
            index += 1
            cached_file = export_record['cached_file']
            url = export_record['url']
            print()
            extension = TDownloadedFile(url).file_extension
            out_file = "{}_{}_{}{}".format(site_url.replace('/', '_'), index, int(time.time()), extension)
            tol_res = toloka_results.get(cached_file)
            if tol_res == "YES":
                folder = args.positive_folder
            elif tol_res == "NO":
                folder = args.negative_folder
            else:
                folder = None
            if folder is not None:
                out_file = os.path.join(folder, out_file)
                print("{} -> {}".format(url, out_file))
                shutil.copy(cached_file, out_file)

def create_toloka_pool(project_path, toloka_stream):
    logger = logging.getLogger("")
    with TRobotProject(logger, project_path) as project:
        project.read_project()
        office_info: TWebSiteCrawlSnapshot
        office_info = project.web_site_snapshots[0]
        toloka_stream.write("INPUT:url\tINPUT:file_link\tINPUT:file_extension\tINPUT:html\n")
        ec = TExternalConverters()
        cnt = 0
        all_files = 0
        for export_record in office_info.exported_files:
            all_files += 1
            sys.stderr.write("{}/{}\n".format(all_files, len(office_info.exported_files)))
            sys.stderr.flush()
            url = export_record['url']
            cached_file = export_record['cached_file']
            extension = TDownloadedFile(url).file_extension
            temp_file = "dummy" + extension
            shutil.copy(cached_file, temp_file)
            html = ec.convert_to_html_with_soffice(temp_file)
            os.unlink(temp_file)
            if html is not None:
                # tabs and line breaks would break the TSV format
                html = html.replace("\t", " ").replace("\n", " ").replace("\r", " ")
                toloka_stream.write("\t".join((url, cached_file, extension, html)) + "\n")
                cnt += 1
        sys.stderr.write("written {} lines out of {}\n".format(cnt, all_files))

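# A minimal driver sketch for create_toloka_pool, assuming it lives in a runnable script;
# the project path and the output file name are hypothetical example values.
if __name__ == "__main__":
    with open("toloka_pool.tsv", "w", encoding="utf8") as toloka_stream:
        create_toloka_pool("project.txt", toloka_stream)
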
def __init__(self, port, website_folder, regional_main_pages=[]):
    self.dlrobot = None
    self.dlrobot_project = None
    self.web_site_folder = os.path.join(os.path.dirname(__file__), website_folder)
    name = os.path.basename(website_folder)
    self.data_folder = os.path.join(os.path.dirname(__file__), "data.{}".format(name))
    self.dlrobot_result_folder = os.path.join(self.data_folder, "result")
    if os.path.exists(self.data_folder):
        shutil.rmtree(self.data_folder, ignore_errors=True)
    handler = partial(http.server.SimpleHTTPRequestHandler, directory=self.web_site_folder)
    if not is_local_http_port_free(port):
        # the requested port is busy, take the first free fallback port
        for p in TTestEnv.additional_ports:
            if is_local_http_port_free(p):
                port = p
                break
    assert is_local_http_port_free(port)
    self.web_site = http.server.HTTPServer(server_address=("127.0.0.1", port), RequestHandlerClass=handler)
    os.mkdir(self.data_folder)
    os.chdir(self.data_folder)
    self.project_path = os.path.join(self.data_folder, "project.txt")
    regional = list("http://127.0.0.1:{}/{}".format(port, url) for url in regional_main_pages)
    project = TRobotProject.create_project_str("http://127.0.0.1:{}".format(port),
                                               regional_main_pages=regional,
                                               disable_search_engine=True)
    with open(self.project_path, "w") as outp:
        outp.write(project)

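# Usage sketch (hypothetical): assuming this constructor belongs to TTestEnv (the code
# already refers to TTestEnv.additional_ports), a functional test would create the
# environment and serve the site folder over the local HTTP server, e.g.
#
#   env = TTestEnv(8190, "web_sites/simple_site")            # port and folder are made-up values
#   threading.Thread(target=env.web_site.serve_forever, daemon=True).start()
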
def add_files_of_one_project(self, dlrobot_project):
    self.logger.debug("process {}".format(dlrobot_project))
    project_folder = os.path.join(self.args.input_dlrobot_folder, dlrobot_project)
    dlrobot_project_without_timestamp = re.sub(r'\.[0-9]+$', '', dlrobot_project)
    project_path = os.path.join(project_folder, dlrobot_project_without_timestamp + ".txt")
    if not os.path.exists(project_path):
        self.logger.error("no dlrobot project file found in {}".format(project_folder))
        return
    try:
        project = TRobotProject(self.logger, project_path, config=self.dlrobot_config,
                                web_sites_db=self.web_sites_db)
        project.read_project(check_step_names=False)
        office_info: TWebSiteCrawlSnapshot
        office_info = project.web_site_snapshots[0]
        site_url = office_info.get_site_url()
        exported_files = dict()
        for export_record in office_info.export_env.exported_files:
            exported_files[export_record.sha256] = export_record
    except Exception as exp:
        self.logger.error("cannot read project {}, exp={}".format(project_path, exp))
        return

    file_info: TExportFile
    for sha256, file_info in exported_files.items():
        web_ref = TWebReference(
            url=file_info.url,
            crawl_epoch=self.args.max_ctime,
            site_url=site_url,
            declaration_year=file_info.declaration_year)
        self.add_dlrobot_file(sha256, file_info.file_extension, [web_ref])

def open_project(self):
    self.logger.debug("hostname={}".format(platform.node()))
    self.logger.debug("use {} as a cache folder".format(os.path.realpath(TDownloadEnv.FILE_CACHE_FOLDER)))
    with TRobotProject(self.logger, self.args.project, self.config, self.args.result_folder) as project:
        self.logger.debug("total timeout = {}".format(self.config.get_dlrobot_total_timeout()))
        project.read_project()
        project.fetch_main_pages()
        if self.args.only_click_paths:
            project.write_export_stats()
        else:
            self.make_steps(project)
            project.write_export_stats()
            if self.args.click_features_file:
                project.write_click_features(self.args.click_features_file)
        return project

def collect_links_selenium(self, start_url, link_func=TRobotStep.looks_like_a_declaration_link,
                           is_last_step=False):
    TDownloadEnv.clear_cache_folder()
    robot_steps = [{'step_name': "declarations"}]
    with TRobotProject(self.logger, "project.txt", TRobotConfig(passport_steps=robot_steps),
                       "result", enable_search_engine=False) as project:
        project.read_project()
        office_info = project.web_site_snapshots[0]
        office_info.create_export_folder()
        step_info = TRobotStep(office_info, step_name="declarations", is_last_step=is_last_step)
        if isinstance(start_url, list):
            for x in start_url:
                step_info.pages_to_process[x] = 0
                office_info.url_nodes[x] = TUrlInfo(title="", step_name=None)
        else:
            office_info.url_nodes[start_url] = TUrlInfo(title="", step_name=None)
            step_info.pages_to_process[start_url] = 0
        step_info.processed_pages = set()
        step_info.apply_function_to_links(link_func)

        links = dict()
        for url, weight in step_info.url_to_weight.items():
            # normalize the host part (netloc) to "dummy"
            u = list(urllib.parse.urlparse(url))
            u[1] = "dummy"
            links[urllib.parse.urlunparse(u)] = weight
        for url_info in office_info.url_nodes.values():
            for d in url_info.downloaded_files:
                links[d.downloaded_file] = 1
        return links

def calc_project_stats(self, logger, web_sites_db, project_folder, config: TRobotConfig):
    if not self.task_ended():
        return
    try:
        path = os.path.join(project_folder, self.project_file)
        with TRobotProject(logger, path, config=config, start_selenium=False,
                           enable_search_engine=False, web_sites_db=web_sites_db) as project:
            project.read_project(check_step_names=False)
            web_site_snapshot = project.web_site_snapshots[0]
            self.result_files_count = len(web_site_snapshot.export_env.exported_files)
            self.reach_status = web_site_snapshot.reach_status
    except Exception as exp:
        logger.error("Cannot read file {}: exception={}".format(self.project_file, str(exp)))

class TestProhibitedLinksBase(TestCase):
    def setup_project(self, morda_url):
        logger = setup_logging('prohibited')
        self.project = TRobotProject(logger, '', config=TRobotConfig(), export_folder="result",
                                     enable_search_engine=False)
        web_site = self.project.add_web_site(morda_url)
        self.robot_step = TRobotStep(web_site)
        self.env = TestDlrobotEnv("data.prohibited")
        TDownloadEnv.FILE_CACHE_FOLDER = self.env.data_folder

    def tearDown(self):
        self.env.delete_temp_folder()

    def check_follow(self, src, trg, canon):
        if not src.startswith('http'):
            src = 'http://' + src
        if not trg.startswith('http'):
            trg = 'http://' + trg
        link_info = TLinkInfo(TClickEngine.selenium, src, trg)
        can_follow = self.robot_step.can_follow_this_link(link_info)
        self.assertEqual(canon, can_follow, msg="{} -> {}".format(src, trg))

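# A minimal usage sketch (hypothetical, not part of the original suite): a concrete test
# derived from TestProhibitedLinksBase calls setup_project in setUp and then compares
# can_follow_this_link decisions against expected values; the class name, start URL and
# the expected boolean below are illustrative assumptions.
class TestProhibitedLinksExample(TestProhibitedLinksBase):
    def setUp(self):
        self.setup_project("http://example.com")

    def test_follow_example(self):
        # the expected value depends on the configured prohibited-link filters
        self.check_follow("example.com", "example.com/documents/anketa.docx", True)
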
def get_new_project_to_process(self, worker_host_name, worker_ip):
    site_url = self.web_sites_to_process.pop(0)
    project_file = TRemoteDlrobotCall.web_site_to_project_file(site_url)
    self.logger.info("start job: {} on {} (host name={}), left jobs: {}, running jobs: {}".format(
        project_file, worker_ip, worker_host_name,
        len(self.web_sites_to_process), self.get_running_jobs_count()))
    remote_call = TRemoteDlrobotCall(worker_ip=worker_ip, project_file=project_file, web_site=site_url)
    remote_call.worker_host_name = worker_host_name
    web_site_passport = self.web_sites_db.get_web_site(site_url)
    regional_main_pages = list()
    if web_site_passport is None:
        self.logger.error(
            "{} is not registered in the web site db, no office information is available for the site".format(site_url))
    project_content_str = TRobotProject.create_project_str(
        site_url, regional_main_pages,
        disable_search_engine=not self.args.enable_search_engines)
    self.worker_2_running_tasks[worker_ip].append(remote_call)
    return remote_call, project_content_str.encode("utf8")

def send_files_to_central(self, files):
    web_domains = list()
    for file_name in files:
        web_domain = self.args.web_domain
        if file_name.endswith('.html'):
            web_domain = self.get_url_from_meta_tag(file_name, self.args.web_domain)
        web_domains.append(web_domain)
    robot_project_path = TRobotProject.create_project_from_exported_files(
        self.logger, self.args.web_domain, files, web_domains)
    headers = {
        DLROBOT_HEADER_KEYS.EXIT_CODE: 0,
        DLROBOT_HEADER_KEYS.PROJECT_FILE: os.path.basename(robot_project_path),
        DLROBOT_HEADER_KEYS.WORKER_HOST_NAME: platform.node(),
        "Content-Type": "application/binary"
    }
    self.logger.debug("send results back for {}".format(robot_project_path))
    dlrobot_results_file_name = os.path.basename(robot_project_path) + ".tar.gz"
    project_folder = self.args.web_domain
    with tarfile.open(dlrobot_results_file_name, "w:gz") as tar:
        for f in os.listdir(project_folder):
            tar.add(os.path.join(project_folder, f), arcname=f)
    self.logger.debug("created file {} size={}".format(
        dlrobot_results_file_name, os.stat(dlrobot_results_file_name).st_size))

    max_send_try_count = 3
    for try_id in range(max_send_try_count):
        conn = None
        try:
            conn = http.client.HTTPConnection(self.args.server_address, timeout=self.args.http_put_timeout)
            with open(dlrobot_results_file_name, "rb") as inp:
                self.logger.debug("put file {} to {}".format(dlrobot_results_file_name, self.args.server_address))
                conn.request("PUT", dlrobot_results_file_name, inp.read(), headers=headers)
            response = conn.getresponse()
            conn.close()
            conn = None
            self.logger.debug("sent dlrobot result file {}, size={}, http_code={}".format(
                dlrobot_results_file_name,
                os.stat(dlrobot_results_file_name).st_size,
                response.status))
            break
        except Exception as exc:
            self.logger.error('worker got {}'.format(type(exc).__name__))
            self.logger.error('try_id = {} out of {}'.format(try_id, max_send_try_count))
            if conn is not None:
                conn.close()
            if try_id == max_send_try_count - 1:
                self.logger.debug("give up, we cannot send the results back, so the results are useless")
            else:
                sleep_seconds = (try_id + 1) * 180
                self.logger.debug('sleep for {} seconds'.format(sleep_seconds))
                time.sleep(sleep_seconds)

    self.logger.debug("delete file {}".format(dlrobot_results_file_name))
    os.unlink(dlrobot_results_file_name)
    shutil.rmtree(project_folder, ignore_errors=True)
    time.sleep(self.args.wait_after_each_doc * len(files))