def send_results_back(self, project_file, exitcode):
    """Pack the project folder into a tar.gz archive and PUT it to the server.

    The archive is created in the current working directory, uploaded with at
    most three attempts (sleeping 180 and then 360 seconds between failed
    attempts) and removed afterwards.  If ``self.args.delete_dlrobot_results``
    is set, the project folder is removed as well.

    :param project_file: path to the project file; the folder that contains it
        is archived and its base name is sent in the request headers.
    :param exitcode: exit code of the finished job, sent in the request headers.
    """
    project_folder = os.path.dirname(project_file)
    headers = {
        DLROBOT_HEADER_KEYS.EXIT_CODE: exitcode,
        DLROBOT_HEADER_KEYS.PROJECT_FILE: TUrlUtf8Encode.to_idna(os.path.basename(project_file)),
        DLROBOT_HEADER_KEYS.WORKER_HOST_NAME: platform.node(),
        "Content-Type": "application/binary"
    }
    self.logger.debug("send results back for {} exitcode={}".format(project_file, exitcode))
    dlrobot_results_file_name = os.path.basename(project_file) + ".tar.gz"

    try:
        with tarfile.open(dlrobot_results_file_name, "w:gz") as tar:
            for f in os.listdir(project_folder):
                tar.add(os.path.join(project_folder, f), arcname=f)

        self.logger.debug(
            "created file {} size={}".format(dlrobot_results_file_name, os.stat(dlrobot_results_file_name).st_size))

        max_send_try_count = 3
        for try_id in range(max_send_try_count):
            conn = None
            try:
                conn = http.client.HTTPConnection(self.args.server_address, timeout=self.args.http_put_timeout)
                with open(dlrobot_results_file_name, "rb") as inp:
                    self.logger.debug("put file {} to {}".format(dlrobot_results_file_name, self.args.server_address))
                    conn.request("PUT", TUrlUtf8Encode.to_idna(dlrobot_results_file_name), inp.read(), headers=headers)
                    response = conn.getresponse()
                conn.close()
                conn = None
                self.logger.debug("sent dlrobot result file {}, exitcode={}. size={}, http_code={}".format(
                    dlrobot_results_file_name,
                    exitcode,
                    os.stat(dlrobot_results_file_name).st_size,
                    response.status))
                break
            except Exception as exc:
                self.logger.error('worker got {}'.format(type(exc).__name__))
                self.logger.error('try_id = {} out of {}'.format(try_id, max_send_try_count))
                if conn is not None:
                    conn.close()
                if try_id == max_send_try_count - 1:
                    self.logger.debug("give up, we cannot send the results back, so the results are useless")
                else:
                    # back off longer after every failed attempt
                    sleep_seconds = (try_id + 1) * 180
                    self.logger.debug('sleep for {} seconds'.format(sleep_seconds))
                    time.sleep(sleep_seconds)
    finally:
        # Remove the temporary archive even if packing or sending raised, so
        # that a failed job does not leave stale archives behind.
        if os.path.exists(dlrobot_results_file_name):
            self.logger.debug("delete file {}".format(dlrobot_results_file_name))
            os.unlink(dlrobot_results_file_name)

    if self.args.delete_dlrobot_results:
        shutil.rmtree(project_folder, ignore_errors=True)
def to_utf8(self):
    """Convert IDNA urls and IDNA redirects of all web sites and log how many were converted."""
    conversions = 0
    for url in self.get_url_list():
        info = self.web_sites.get_web_site(url)

        redirect = info.redirect_to
        if redirect is not None and TUrlUtf8Encode.is_idna_string(redirect):
            info.redirect_to = TUrlUtf8Encode.convert_url_from_idna(redirect)
            redirects_to_itself = info.redirect_to == url
            if redirects_to_itself and info.reach_status == TWebSiteReachStatus.abandoned:
                # the redirect target is this very site: drop the redirect
                # and mark the site as normal again
                info.redirect_to = None
                info.reach_status = TWebSiteReachStatus.normal
            conversions += 1

        if TUrlUtf8Encode.is_idna_string(url):
            info.url = TUrlUtf8Encode.convert_url_from_idna(url)
            conversions += 1

    self.logger.info("{} conversions made".format(conversions))
def _prepare_url_before_http_request(url, method):
    """Apply the request policy to *url* and return it IDNA-encoded with a percent-quoted path.

    :param url: url that is about to be requested.
    :param method: http method, passed on to the request policy check.
    :return: the url rebuilt from its parts with a re-quoted path.
    """
    THttpRequester.consider_request_policy(url, method)
    parts = urlsplit_pro(TUrlUtf8Encode.convert_url_to_idna(url))
    # unquote first, then quote the result
    requoted_path = urllib.parse.quote(urllib.parse.unquote(parts.path))
    return urllib.parse.urlunsplit(
        (parts.scheme, parts.netloc, requoted_path, parts.query, parts.fragment))
def add_web_site(self, site_url: str):
    """Append a new web site with the given url to this office.

    Asserts that the url is not IDNA-encoded, starts with "http" and is not
    registered for this office yet.
    """
    # russian domain must be in utf8
    assert not TUrlUtf8Encode.is_idna_string(site_url)
    assert site_url.startswith("http")
    assert all(known_site.url != site_url for known_site in self.office_web_sites)

    new_site = TDeclarationWebSite(parent_office=self)
    new_site.url = site_url
    self.office_web_sites.append(new_site)
def test_idna_url(self):
    """Urls must survive a utf8 -> idna -> utf8 round trip; a known idna domain must decode."""

    def assert_round_trip(url):
        encoded = TUrlUtf8Encode.convert_url_to_idna(url)
        decoded = TUrlUtf8Encode.convert_url_from_idna(encoded)
        self.assertEqual(url, decoded)

    for url in ("дом.рф/html.html", "http://дом.рф/html.html", "http://дом.рф"):
        assert_round_trip(url)

    decoded_domain = TUrlUtf8Encode.convert_url_from_idna('xn--80agabx3af.xn--p1ai')
    self.assertEqual(decoded_domain, 'дагони.рф')
def get_new_task_job(self):
    """Request a new project from dlrobot central and store it in a fresh local folder.

    The project file is written to a folder named after the project file
    (without extension); an already existing folder is removed first.  A
    timestamp (current time plus the timeout taken from the robot config) is
    written to TIMEOUT_FILE_PATH inside that folder.

    :return: tuple ``(project_file, config)`` with the local path of the
        project file and the robot config read by the received config type.
    :raises DlrobotWorkerException: if the server does not answer with
        200 OK or a required response header is missing.
    """
    conn = http.client.HTTPConnection(self.args.server_address)
    try:
        headers = {
            DLROBOT_HEADER_KEYS.WORKER_HOST_NAME: platform.node(),
        }
        conn.request("GET", "?authorization_code=456788", headers=headers)
        response = conn.getresponse()
        status = response.status
        project_file_header = response.getheader(DLROBOT_HEADER_KEYS.PROJECT_FILE)
        dlrobot_config_type = response.getheader(DLROBOT_HEADER_KEYS.DLROBOT_CONFIG_TYPE)
        # The body has to be read before the connection is closed: closing a
        # persistent connection also closes the pending response, and a
        # closed response reads as empty.
        file_data = response.read() if status == http.HTTPStatus.OK else b""
    finally:
        # close the connection on failures as well
        conn.close()

    if status != http.HTTPStatus.OK:
        if status != DLROBOT_HTTP_CODE.NO_MORE_JOBS:
            self.logger.error("cannot get a new project from dlrobot central, httpcode={}".format(
                status
            ))
        raise DlrobotWorkerException()

    # decode only a header that is really there
    project_file = None
    if project_file_header is not None:
        project_file = TUrlUtf8Encode.from_idna(project_file_header)
    if project_file is None:
        self.logger.error("cannot find header {}".format(DLROBOT_HEADER_KEYS.PROJECT_FILE))
        raise DlrobotWorkerException()

    if dlrobot_config_type is None:
        self.logger.error("cannot find header {}".format(DLROBOT_HEADER_KEYS.DLROBOT_CONFIG_TYPE))
        raise DlrobotWorkerException()
    config = TRobotConfig.read_by_config_type(dlrobot_config_type)

    self.logger.debug("get task {} size={}".format(project_file, len(file_data)))
    basename_project_file = os.path.basename(project_file)
    folder, _ = os.path.splitext(basename_project_file)
    if os.path.exists(folder):
        shutil.rmtree(folder, ignore_errors=True)
    self.logger.debug("mkdir {}".format(folder))
    os.makedirs(folder, exist_ok=True)

    self.logger.debug("write {} to {}".format(basename_project_file, folder))
    project_file = os.path.join(folder, basename_project_file)
    with open(project_file, "wb") as outp:
        outp.write(file_data)

    with open(os.path.join(folder, TIMEOUT_FILE_PATH), "w") as outp:
        outp.write("{}".format(int(time.time()) + config.get_timeout_to_delete_files_in_worker()))
    return project_file, config
def check(s):
    """Assert that *s* is unchanged after to_idna followed by from_idna.

    NOTE(review): ``self`` is not a parameter here; presumably it comes from
    an enclosing test method that is not visible in this fragment.
    """
    restored = TUrlUtf8Encode.from_idna(TUrlUtf8Encode.to_idna(s))
    self.assertEqual(s, restored)
def test_idna_url(self):
    """An idna url with a path must be decoded to its utf8 form."""
    idna_url = "https://xn----7sbabb9bafefpyi3bm2b9a2gra.xn--p1ai/a.href"
    expected = "https://батайск-официальный.рф/a.href"
    self.assertEqual(expected, TUrlUtf8Encode.convert_url_from_idna(idna_url))
def test_idna_exception(self):
    """to_idna is expected to return a string it cannot encode unchanged."""
    # error in encoding
    bad_idna_string = ".bad_domain"
    encoded = TUrlUtf8Encode.to_idna(bad_idna_string)
    self.assertEqual(encoded, bad_idna_string)
def check(s):
    """Assert that url *s* is unchanged after convert_url_to_idna followed by convert_url_from_idna.

    NOTE(review): ``self`` is not a parameter here; presumably it comes from
    an enclosing test method that is not visible in this fragment.
    """
    encoded = TUrlUtf8Encode.convert_url_to_idna(s)
    self.assertEqual(s, TUrlUtf8Encode.convert_url_from_idna(encoded))
def convert_to_utf8(self):
    """Replace the stored site url with the result of TUrlUtf8Encode.convert_if_idna."""
    converted_url = TUrlUtf8Encode.convert_if_idna(self._site_url)
    self._site_url = converted_url
def do_PUT(self):
    """Receive the result archive of a finished task from a worker and register it.

    The request must carry the headers Content-Length,
    DLROBOT_HEADER_KEYS.PROJECT_FILE, DLROBOT_HEADER_KEYS.EXIT_CODE and
    DLROBOT_HEADER_KEYS.WORKER_HOST_NAME.  Any missing or malformed header,
    a failed body read or a failed registration is answered with
    400 BAD_REQUEST; a successful registration is answered with 201 CREATED.
    """

    def send_error(message, http_code=http.HTTPStatus.BAD_REQUEST):
        self.server.logger.error(message)
        http.server.SimpleHTTPRequestHandler.send_error(
            self, http_code, message)

    if self.path is None:
        send_error("no file specified")
        return

    file_length = self.headers.get('Content-Length')
    if file_length is None or not file_length.isdigit():
        send_error('cannot find header Content-Length')
        return
    file_length = int(file_length)

    # decode the header only if it is present, so that a missing header is
    # reported to the client instead of being passed to the decoder
    project_file_header = self.headers.get(DLROBOT_HEADER_KEYS.PROJECT_FILE)
    project_file = None
    if project_file_header is not None:
        project_file = TUrlUtf8Encode.from_idna(project_file_header)
    if project_file is None:
        send_error('cannot find header "{}"'.format(
            DLROBOT_HEADER_KEYS.PROJECT_FILE))
        return

    exitcode = self.headers.get(DLROBOT_HEADER_KEYS.EXIT_CODE)
    if exitcode is None or not exitcode.isdigit():
        send_error('missing exitcode or bad exit code')
        return

    worker_host_name = self.headers.get(
        DLROBOT_HEADER_KEYS.WORKER_HOST_NAME)
    if worker_host_name is None:
        # the original format string '"{]' was malformed and raised ValueError
        send_error('cannot find header "{}"'.format(
            DLROBOT_HEADER_KEYS.WORKER_HOST_NAME))
        return

    worker_ip = self.client_address[0]
    self.server.logger.debug(
        "start reading file {} file size {} from {}".format(
            project_file, file_length, worker_ip))

    try:
        archive_file_bytes = self.rfile.read(file_length)
    except Exception as exp:
        send_error('file reading failed: {}'.format(str(exp)))
        return

    try:
        self.server.register_task_result(worker_host_name, worker_ip,
                                         project_file, int(exitcode),
                                         archive_file_bytes)
    except Exception as exp:
        send_error('register_task_result failed: {}'.format(str(exp)))
        self.server.register_task_result_error_count += 1
        # notify via telegram on every tenth registration error
        if self.server.register_task_result_error_count % 10 == 0:
            self.server.send_to_telegram(
                "dlrobot_central: register_task_result_error_count: {}".
                format(self.server.register_task_result_error_count))
        return

    self.send_response(http.HTTPStatus.CREATED)
    self.end_headers()
def do_GET(self):
    """Hand out a new project to a worker, or answer a special command.

    Special commands are handled by ``self.process_special_commands()``.
    Otherwise the request must contain an ``authorization_code`` query
    parameter and the DLROBOT_HEADER_KEYS.WORKER_HOST_NAME header.  On success
    the answer is 200 with the IDNA-encoded project file name and the config
    type in the headers and the project content in the body; otherwise an
    error code is sent (NO_MORE_JOBS, TOO_BUSY or 400 BAD_REQUEST).
    """

    def send_error(message, http_code=http.HTTPStatus.BAD_REQUEST, log_error=True):
        if log_error:
            self.server.logger.error(message)
        http.server.SimpleHTTPRequestHandler.send_error(
            self, http_code, message)

    query_components = dict()
    if not self.parse_cgi(query_components):
        send_error('bad request', log_error=False)
        return

    try:
        if self.process_special_commands():
            return
    except Exception as exp:
        self.server.logger.error(exp)
        return

    dummy_code = query_components.get('authorization_code', None)
    if not dummy_code:
        send_error('No authorization_code provided', log_error=False)
        return

    if not self.server.have_tasks():
        send_error("no more jobs", DLROBOT_HTTP_CODE.NO_MORE_JOBS)
        return

    worker_host_name = self.headers.get(
        DLROBOT_HEADER_KEYS.WORKER_HOST_NAME)
    if worker_host_name is None:
        send_error('cannot find header {}'.format(
            DLROBOT_HEADER_KEYS.WORKER_HOST_NAME))
        return

    if not self.server.check_pdf_conversion_server():
        send_error("pdf conversion server is too busy",
                   DLROBOT_HTTP_CODE.TOO_BUSY)
        return

    worker_ip = self.client_address[0]
    if self.server.worker_is_banned(worker_ip, worker_host_name):
        error_msg = "too many dlrobot errors from ip {} hostname {}".format(
            worker_ip, worker_host_name)
        send_error(error_msg, DLROBOT_HTTP_CODE.TOO_BUSY)
        return

    try:
        remote_call, project_content = self.server.get_new_project_to_process(
            worker_host_name, worker_ip)
    except Exception as exp:
        # was self.server.error.logger(...), which raised AttributeError
        # inside the handler and hid the real failure
        self.server.logger.error(
            "Cannot send project, exception = {}".format(exp))
        send_error(str(exp))
        return

    self.send_response(200)
    self.send_header(DLROBOT_HEADER_KEYS.PROJECT_FILE,
                     TUrlUtf8Encode.to_idna(remote_call.project_file))
    self.send_header(DLROBOT_HEADER_KEYS.DLROBOT_CONFIG_TYPE,
                     self.server.config.config_type)
    self.end_headers()
    self.wfile.write(project_content)