def send_results_back(self, project_file, exitcode):
    """Pack the project folder into a tar.gz archive and PUT it to the central server.

    The archive is created in the current working directory, named after the
    project file, and is removed once the upload loop is over (whether or not
    any attempt succeeded). The upload is tried up to three times, sleeping
    180 * attempt-number seconds between failed attempts.

    Args:
        project_file: path to the project file; its directory is what gets archived.
        exitcode: exit code reported to the server in a request header.
    """
    project_folder = os.path.dirname(project_file)
    project_basename = os.path.basename(project_file)
    headers = {
        DLROBOT_HEADER_KEYS.EXIT_CODE: exitcode,
        DLROBOT_HEADER_KEYS.PROJECT_FILE: TUrlUtf8Encode.to_idna(project_basename),
        DLROBOT_HEADER_KEYS.WORKER_HOST_NAME: platform.node(),
        "Content-Type": "application/binary"
    }
    self.logger.debug("send results back for {} exitcode={}".format(project_file, exitcode))
    archive_name = project_basename + ".tar.gz"

    # Every entry of the project folder goes to the archive root under its own name.
    with tarfile.open(archive_name, "w:gz") as archive:
        for entry in os.listdir(project_folder):
            archive.add(os.path.join(project_folder, entry), arcname=entry)

    archive_size = os.stat(archive_name).st_size
    self.logger.debug("created file {} size={}".format(archive_name, archive_size))

    max_send_try_count = 3
    last_try_id = max_send_try_count - 1
    for try_id in range(max_send_try_count):
        conn = None
        try:
            conn = http.client.HTTPConnection(self.args.server_address, timeout=self.args.http_put_timeout)
            with open(archive_name, "rb") as inp:
                self.logger.debug("put file {} to {}".format(archive_name, self.args.server_address))
                payload = inp.read()
            conn.request("PUT", TUrlUtf8Encode.to_idna(archive_name), payload, headers=headers)
            response = conn.getresponse()
            conn.close()
            # Mark the connection as already closed so the handler below does not close it twice.
            conn = None
            self.logger.debug("sent dlrobot result file {}, exitcode={}. size={}, http_code={}".format(
                archive_name,
                exitcode,
                os.stat(archive_name).st_size,
                response.status))
            break
        except Exception as exc:
            self.logger.error('worker got {}'.format(type(exc).__name__))
            self.logger.error('try_id = {} out of {}'.format(try_id, max_send_try_count))
            if conn is not None:
                conn.close()
            if try_id != last_try_id:
                # Back off progressively longer before the next attempt.
                sleep_seconds = (try_id + 1) * 180
                self.logger.debug('sleep for {} seconds'.format(sleep_seconds))
                time.sleep(sleep_seconds)
            else:
                self.logger.debug("give up, we cannot send the results back, so the results are useless")

    self.logger.debug("delete file {}".format(archive_name))
    os.unlink(archive_name)

    if self.args.delete_dlrobot_results:
        shutil.rmtree(project_folder, ignore_errors=True)
Exemple #2
0
 def to_utf8(self):
     """Convert IDNA-encoded site urls and redirect targets to their utf8 form.

     Logs how many conversions were made. A redirect that, once decoded,
     points to the site itself while the site is marked as abandoned is
     dropped and the site is marked as normal again.
     """
     conversions = 0
     for site_url in self.get_url_list():
         site_info = self.web_sites.get_web_site(site_url)

         redirect = site_info.redirect_to
         if redirect is not None and TUrlUtf8Encode.is_idna_string(redirect):
             site_info.redirect_to = TUrlUtf8Encode.convert_url_from_idna(redirect)
             is_self_redirect = site_info.redirect_to == site_url
             if is_self_redirect and site_info.reach_status == TWebSiteReachStatus.abandoned:
                 site_info.redirect_to = None
                 site_info.reach_status = TWebSiteReachStatus.normal
             conversions += 1

         if TUrlUtf8Encode.is_idna_string(site_url):
             site_info.url = TUrlUtf8Encode.convert_url_from_idna(site_url)
             conversions += 1
     self.logger.info("{} conversions made".format(conversions))
 def _prepare_url_before_http_request(url, method):
     """Return the url in the form used for the actual http request.

     The request policy is consulted first, then the url is converted to
     IDNA and its path is unquoted and quoted again, so the path ends up
     percent-encoded exactly once.
     """
     THttpRequester.consider_request_policy(url, method)
     parts = urlsplit_pro(TUrlUtf8Encode.convert_url_to_idna(url))
     # unquote before quote so an already-encoded path is not encoded twice
     requoted_path = urllib.parse.quote(urllib.parse.unquote(parts.path))
     return urllib.parse.urlunsplit(
         (parts.scheme, parts.netloc, requoted_path, parts.query, parts.fragment))
 def add_web_site(self, site_url: str):
     """Append a new web site with the given url to this office.

     The url must not be IDNA-encoded, must start with "http" and must not
     already be registered for this office (all checked with asserts).
     """
     # russian domain must be in utf8
     assert not TUrlUtf8Encode.is_idna_string(site_url)
     assert site_url.startswith("http")
     assert all(known.url != site_url for known in self.office_web_sites)
     web_site = TDeclarationWebSite(parent_office=self)
     web_site.url = site_url
     self.office_web_sites.append(web_site)
    def test_idna_url(self):
        """Check that urls survive a utf8 -> IDNA -> utf8 round trip and that a known IDNA host decodes as expected."""
        samples = (
            "дом.рф/html.html",
            "http://дом.рф/html.html",
            "http://дом.рф",
        )
        for original in samples:
            encoded = TUrlUtf8Encode.convert_url_to_idna(original)
            decoded = TUrlUtf8Encode.convert_url_from_idna(encoded)
            self.assertEqual(original, decoded)

        self.assertEqual(TUrlUtf8Encode.convert_url_from_idna('xn--80agabx3af.xn--p1ai'), 'дагони.рф')
    def get_new_task_job(self):
        """Request a new project from the central server and store it in a fresh local folder.

        The project file name and the config type come from response headers,
        the project file content from the response body. The file is written
        into a folder named after the file name without its extension (the
        folder is wiped first if it already exists), together with a timeout
        file holding the unix time after which the worker files may be deleted.

        Returns:
            A tuple (project_file, config): the path of the written project
            file and the TRobotConfig read for the received config type.

        Raises:
            DlrobotWorkerException: if the server does not answer with
                HTTP 200 or a required header is missing.
        """
        conn = http.client.HTTPConnection(self.args.server_address)
        try:
            headers = {
                DLROBOT_HEADER_KEYS.WORKER_HOST_NAME: platform.node(),
            }
            conn.request("GET", "?authorization_code=456788", headers=headers)
            response = conn.getresponse()
            # The body must be read while the connection is still open: closing a
            # kept-alive connection also closes the pending response, after which
            # read() yields no data. Status and headers stay available afterwards.
            file_data = response.read()
        finally:
            conn.close()

        if response.status != http.HTTPStatus.OK:
            # "no more jobs" is an expected answer, so it is not logged as an error
            if response.status != DLROBOT_HTTP_CODE.NO_MORE_JOBS:
                self.logger.error("cannot get a new project from dlrobot central, httpcode={}".format(
                    response.status
                ))
            raise DlrobotWorkerException()

        # Do not hand a missing header to the IDNA decoder; report it instead.
        raw_project_file = response.getheader(DLROBOT_HEADER_KEYS.PROJECT_FILE)
        project_file = None
        if raw_project_file is not None:
            project_file = TUrlUtf8Encode.from_idna(raw_project_file)
        if project_file is None:
            self.logger.error("cannot find header {}".format(DLROBOT_HEADER_KEYS.PROJECT_FILE))
            raise DlrobotWorkerException()

        dlrobot_config_type = response.getheader(DLROBOT_HEADER_KEYS.DLROBOT_CONFIG_TYPE)
        if dlrobot_config_type is None:
            self.logger.error("cannot find header {}".format(DLROBOT_HEADER_KEYS.DLROBOT_CONFIG_TYPE))
            raise DlrobotWorkerException()
        config = TRobotConfig.read_by_config_type(dlrobot_config_type)

        self.logger.debug("get task {} size={}".format(project_file, len(file_data)))
        # Only the base name is used, so the project always lands under the current directory.
        basename_project_file = os.path.basename(project_file)
        folder, _ = os.path.splitext(basename_project_file)

        if os.path.exists(folder):
            shutil.rmtree(folder, ignore_errors=True)
        self.logger.debug("mkdir {}".format(folder))
        os.makedirs(folder, exist_ok=True)

        self.logger.debug("write {}  to  {}".format(basename_project_file, folder))
        project_file = os.path.join(folder, basename_project_file)
        with open(project_file, "wb") as outp:
            outp.write(file_data)
        # Store the absolute unix time after which the files of this task may be deleted.
        with open(os.path.join(folder, TIMEOUT_FILE_PATH), "w") as outp:
            outp.write("{}".format(int(time.time()) + config.get_timeout_to_delete_files_in_worker()))

        return project_file, config
 def check(s):
     """Assert that s is unchanged after to_idna followed by from_idna."""
     round_trip = TUrlUtf8Encode.from_idna(TUrlUtf8Encode.to_idna(s))
     self.assertEqual(s, round_trip)
 def test_idna_url(self):
     """Check that an IDNA-encoded https url is decoded to its utf8 form, path kept intact."""
     expected = "https://батайск-официальный.рф/a.href"
     decoded = TUrlUtf8Encode.convert_url_from_idna(
         "https://xn----7sbabb9bafefpyi3bm2b9a2gra.xn--p1ai/a.href")
     self.assertEqual(expected, decoded)
 def test_idna_exception(self):
     """Check that to_idna returns its input unchanged for a string that cannot be encoded."""
     malformed = ".bad_domain"  # error in encoding
     self.assertEqual(TUrlUtf8Encode.to_idna(malformed), malformed)
 def check(s):
     """Assert that s is unchanged after convert_url_to_idna followed by convert_url_from_idna."""
     encoded = TUrlUtf8Encode.convert_url_to_idna(s)
     self.assertEqual(s, TUrlUtf8Encode.convert_url_from_idna(encoded))
 def convert_to_utf8(self):
     """Replace the stored site url with the result of TUrlUtf8Encode.convert_if_idna."""
     current_url = self._site_url
     self._site_url = TUrlUtf8Encode.convert_if_idna(current_url)
Exemple #12
0
    def do_PUT(self):
        """Receive a task result archive from a worker and register it on the server.

        Required request headers: Content-Length (digits only), the project
        file name (IDNA-encoded), the exit code (digits only) and the worker
        host name. Any validation, reading or registration failure is logged
        and answered with HTTP 400; success is answered with HTTP 201.
        """
        def send_error(message, http_code=http.HTTPStatus.BAD_REQUEST):
            # Log on the server side and report the same message to the client.
            self.server.logger.error(message)
            http.server.SimpleHTTPRequestHandler.send_error(
                self, http_code, message)

        if self.path is None:
            send_error("no file specified")
            return

        file_length = self.headers.get('Content-Length')
        if file_length is None or not file_length.isdigit():
            send_error('cannot find header  Content-Length')
            return
        file_length = int(file_length)

        # Decode only a header that is really there, so that a missing header is
        # reported to the client instead of being passed to the IDNA decoder.
        raw_project_file = self.headers.get(DLROBOT_HEADER_KEYS.PROJECT_FILE)
        project_file = None
        if raw_project_file is not None:
            project_file = TUrlUtf8Encode.from_idna(raw_project_file)
        if project_file is None:
            send_error('cannot find header "{}"'.format(
                DLROBOT_HEADER_KEYS.PROJECT_FILE))
            return

        exitcode = self.headers.get(DLROBOT_HEADER_KEYS.EXIT_CODE)
        if exitcode is None or not exitcode.isdigit():
            send_error('missing exitcode or bad exit code')
            return

        worker_host_name = self.headers.get(
            DLROBOT_HEADER_KEYS.WORKER_HOST_NAME)
        if worker_host_name is None:
            # The placeholder used to be the malformed '"{]', which made
            # str.format raise ValueError instead of producing the message.
            send_error('cannot find header "{}"'.format(
                DLROBOT_HEADER_KEYS.WORKER_HOST_NAME))
            return

        worker_ip = self.client_address[0]
        self.server.logger.debug(
            "start reading file {} file size {} from {}".format(
                project_file, file_length, worker_ip))

        try:
            archive_file_bytes = self.rfile.read(file_length)
        except Exception as exp:
            send_error('file reading failed: {}'.format(str(exp)))
            return

        try:
            self.server.register_task_result(worker_host_name,
                                             worker_ip, project_file,
                                             int(exitcode), archive_file_bytes)
        except Exception as exp:
            send_error('register_task_result failed: {}'.format(str(exp)))
            self.server.register_task_result_error_count += 1
            # Notify via telegram on every tenth registration failure.
            if self.server.register_task_result_error_count % 10 == 0:
                self.server.send_to_telegram(
                    "dlrobot_central: register_task_result_error_count: {}".
                    format(self.server.register_task_result_error_count))
            return

        self.send_response(http.HTTPStatus.CREATED)
        self.end_headers()
Exemple #13
0
    def do_GET(self):
        """Hand out a new project to a worker, or answer a special command.

        Apart from special commands, the request must carry a non-empty
        authorization_code query parameter and the worker host name header.
        The worker gets an error code when there are no tasks, when the pdf
        conversion server check fails, or when the worker is banned.
        On success the response is HTTP 200 with the IDNA-encoded project file
        name and the config type in headers and the project content as body.
        """
        def send_error(message,
                       http_code=http.HTTPStatus.BAD_REQUEST,
                       log_error=True):
            # Optionally log on the server side, then report the message to the client.
            if log_error:
                self.server.logger.error(message)
            http.server.SimpleHTTPRequestHandler.send_error(
                self, http_code, message)

        query_components = dict()
        if not self.parse_cgi(query_components):
            send_error('bad request', log_error=False)
            return

        try:
            if self.process_special_commands():
                return
        except Exception as exp:
            # NOTE(review): no HTTP response is sent on this path - confirm this is intended.
            self.server.logger.error(exp)
            return

        dummy_code = query_components.get('authorization_code', None)
        if not dummy_code:
            send_error('No authorization_code provided', log_error=False)
            return

        if not self.server.have_tasks():
            send_error("no more jobs", DLROBOT_HTTP_CODE.NO_MORE_JOBS)
            return

        worker_host_name = self.headers.get(
            DLROBOT_HEADER_KEYS.WORKER_HOST_NAME)
        if worker_host_name is None:
            send_error('cannot find header {}'.format(
                DLROBOT_HEADER_KEYS.WORKER_HOST_NAME))
            return

        if not self.server.check_pdf_conversion_server():
            send_error("pdf conversion server is too busy",
                       DLROBOT_HTTP_CODE.TOO_BUSY)
            return

        worker_ip = self.client_address[0]

        if self.server.worker_is_banned(worker_ip, worker_host_name):
            error_msg = "too many dlrobot errors from ip {} hostname {}".format(
                worker_ip, worker_host_name)
            send_error(error_msg, DLROBOT_HTTP_CODE.TOO_BUSY)
            return

        try:
            remote_call, project_content = self.server.get_new_project_to_process(
                worker_host_name, worker_ip)
        except Exception as exp:
            # The logger is self.server.logger; the former "self.server.error.logger"
            # raised AttributeError, so the client never received the error reply.
            self.server.logger.error(
                "Cannot send project, exception = {}".format(exp))
            send_error(str(exp))
            return

        self.send_response(200)
        self.send_header(DLROBOT_HEADER_KEYS.PROJECT_FILE,
                         TUrlUtf8Encode.to_idna(remote_call.project_file))
        self.send_header(DLROBOT_HEADER_KEYS.DLROBOT_CONFIG_TYPE,
                         self.server.config.config_type)
        self.end_headers()
        self.wfile.write(project_content)