예제 #1
0
    def save_file(self,
                  file_bytes,
                  file_extension,
                  aux_params=None,
                  force=False,
                  sha256=None):
        """Append a file to the newest bin file and index it under its sha256.

        Args:
            file_bytes: raw contents to append to the bin file.
            file_extension: extension stored with the file info and used
                when the key has to be computed.
            aux_params: extra value passed through to TStoredFileInfo.
            force: when true, write even if the key is already present in
                self.saved_file_params.
            sha256: precomputed key; computed from the data and extension
                when None.

        Returns:
            None. Nothing is written when the storage is read-only or when
            the key already exists and force is false.

        Raises:
            IOError: re-raised after logging if writing to the bin file fails.
            Exception: re-raised after logging if the dbm update fails.
        """
        if self.read_only:
            self.logger.error(
                "cannot save file since the db is opened in read-only mode")
            return

        if sha256 is None:
            sha256 = build_dislosures_sha256_by_file_data(
                file_bytes, file_extension)

        # The lookup is skipped entirely when force is set.
        if not (force or self.saved_file_params.get(sha256) is None):
            return

        # Roll over to a fresh bin file once the current one is too large.
        if self.bin_files[-1].tell() > self.max_bin_file_size:
            self.create_new_bin_file()
        bin_file = self.bin_files[-1]

        try:
            self.write_repeat_header_to_bin_file(file_bytes, file_extension,
                                                 bin_file)
        except IOError as err:
            message = "cannot write repeat header for {} to {}, exception:{}".format(
                sha256, bin_file.name, err)
            self.logger.error(message)
            raise

        try:
            # The payload offset is taken after the repeat header is written.
            payload_offset = bin_file.tell()
            bin_file.write(file_bytes)
            bin_file.flush()
        except IOError as err:
            message = "cannot write file {}{} (size {}) to {}, exception:{}".format(
                sha256, file_extension, len(file_bytes), bin_file.name, err)
            self.logger.error(message)
            raise

        try:
            stored_info = TStoredFileInfo(
                file_position=payload_offset,
                file_no=len(self.bin_files) - 1,
                file_size=len(file_bytes),
                file_extension=file_extension,
                aux_params=aux_params)
            self.write_key_to_dbm(sha256, stored_info.write_to_string())
        except Exception as err:
            message = "cannot add file info {} to {}, exception:{}".format(
                sha256, self.dbm_path, err)
            self.logger.error(message)
            raise

        payload_size = len(file_bytes)
        self.logger.debug("put {}{} (size={}) to bin file {}".format(
            sha256, file_extension, payload_size, len(self.bin_files) - 1))
        self.update_stats(payload_size)
    def save_file(self,
                  file_bytes,
                  file_extension,
                  aux_params=None,
                  force=False,
                  sha256=None):
        """Append a file to the newest bin file and record it in the header.

        The running counter self.output_bin_file_size is advanced by the
        repeat-header size and by the payload size, and is used as the
        payload offset instead of querying the file position.

        Args:
            file_bytes: raw contents to append to the bin file.
            file_extension: extension stored with the file params and used
                when the key has to be computed.
            aux_params: extra value; stored as str(aux_params).
            force: when true, write even if the key is already present in
                self.saved_file_params.
            sha256: precomputed key; computed from the data and extension
                when None.

        Returns:
            None. Nothing is written when the key already exists and force
            is false.

        Raises:
            IOError: re-raised after logging if writing to the bin file fails.
            AssertionError: if the file position disagrees with the counter.
            Exception: re-raised after logging if the header update fails.
        """
        if sha256 is None:
            sha256 = build_dislosures_sha256_by_file_data(
                file_bytes, file_extension)

        # The lookup is skipped entirely when force is set.
        if not (force or self.saved_file_params.get(sha256) is None):
            return

        # Roll over to a fresh bin file (and persist stats) once the
        # tracked size exceeds the limit.
        if self.output_bin_file_size > self.max_bin_file_size:
            self.create_new_bin_file()
            self.save_stats()
        bin_file = self.bin_files[-1]

        try:
            header_size = self.write_repeat_header_to_bin_file(
                file_bytes, file_extension, bin_file)
            self.output_bin_file_size += header_size
        except IOError as err:
            message = "cannot write repeat header for {} to {}, exception:{}".format(
                sha256, bin_file.name, err)
            self.logger.error(message)
            raise

        try:
            # The payload starts right after the repeat header.
            payload_offset = self.output_bin_file_size
            bin_file.write(file_bytes)
            bin_file.flush()
            self.output_bin_file_size += len(file_bytes)
            assert bin_file.tell() == self.output_bin_file_size
        except IOError as err:
            message = "cannot write file {}{} (size {}) to {}, exception:{}".format(
                sha256, file_extension, len(file_bytes), bin_file.name, err)
            self.logger.error(message)
            raise

        try:
            stored_params = TStoredFileParams(
                bin_file_index=len(self.bin_files) - 1,
                file_offset_in_bin_file=payload_offset,
                file_size=len(file_bytes),
                file_extension=file_extension,
                aux_params=str(aux_params))
            self.write_key_to_header(sha256, stored_params.to_string())
        except Exception as err:
            message = "cannot add file info {} to {}, exception:{}".format(
                sha256, self.header_file_path, err)
            self.logger.error(message)
            raise

        payload_size = len(file_bytes)
        self.logger.debug("put {}{} (size={}) to bin file {}".format(
            sha256, file_extension, payload_size, len(self.bin_files) - 1))
        self.update_stats(payload_size)
예제 #3
0
    def put_to_task_queue(self, file_bytes, file_extension, rebuild=False):
        """Write a file into the input task directory and enqueue its name.

        The file is named ``<sha256><file_extension>``. Nothing is written
        when a file with that name already exists, when the key is already
        in self.json_cache_dbm (unless rebuild is true), or when
        self.check_file_extension rejects the name.

        Args:
            file_bytes: raw contents to write.
            file_extension: extension appended to the sha256 to form the name.
            rebuild: when true, the json cache lookup is skipped.
        """
        sha256 = build_dislosures_sha256_by_file_data(file_bytes, file_extension)
        task_path = os.path.join(self.args.input_task_directory,
                                 sha256 + file_extension)

        if os.path.exists(task_path):
            self.logger.debug(
                "file {} already exists in the input queue".format(task_path))
            return

        cache_key = self.build_key(sha256, None)
        if not rebuild and self.json_cache_dbm.get(cache_key) is not None:
            self.logger.debug(
                "file {} already exists in the db".format(task_path))
            return

        extension_is_ok = self.check_file_extension(str(task_path))
        if not extension_is_ok:
            self.logger.debug("bad file extension  {}".format(task_path))
            return

        with open(task_path, "wb") as task_file:
            task_file.write(file_bytes)
        # Only the base name goes into the queue, not the full path.
        self.task_queue.put(os.path.basename(task_path))
        self.logger.debug("put {} to queue".format(task_path))
예제 #4
0
 def check_storage(self, fail_fast):
     """Verify that every stored file still hashes to its own key.

     Args:
         fail_fast: when true, stop at the first mismatch.

     Returns:
         True when no mismatches were found, False otherwise (including
         the early exit taken when fail_fast is set).
     """
     checked = 0
     errors_count = 0
     for checked, key in enumerate(self.get_all_keys(), start=1):
         # Progress message every 100 files.
         if checked % 100 == 0:
             self.logger.debug("file N {}".format(checked))

         info = self.get_saved_file(key)
         recomputed = build_dislosures_sha256_by_file_data(
             info.file_contents, info.file_extension)
         if recomputed == key:
             continue

         errors_count += 1
         message = (
             "key {} has invalid data, length={}, file_extension={}, bin_file_index={}"
             .format(key, info.file_size, info.file_extension, info.file_no))
         self.logger.error(message)
         if fail_fast:
             self.logger.error("stop checking")
             return False

     self.logger.info("checked {} documents, errors_count={}".format(
         checked, errors_count))
     return errors_count == 0
예제 #5
0
    def _do_PUT(self):
        """Handle a PUT request that uploads a file for conversion.

        The request path is read as ``/<action>/<name><extension>`` with an
        optional ``?query`` suffix. ``action`` must be ``convert_if_absent``
        or ``convert_mandatory`` (the latter forces a rebuild), and the
        extension must be listed in ALLOWED_FILE_EXTENSTIONS. The body size
        comes from the Content-Length header and is limited to 2**25 bytes.

        Every rejected request is answered through self.send_error_404.
        Accepted requests are answered with status 201; the reason phrase
        tells whether the file was already converted, already registered
        as a task, or newly created.
        """
        if self.path is None:
            self.send_error_404("no file specified")
            return
        query_components = dict()
        if not self.parse_cgi(query_components):
            self.send_error_404('bad request')
            return

        # Drop the query string before splitting the path: a '.' or '/'
        # inside a query value would otherwise be taken for the extension
        # separator or a directory separator.
        path_only = self.path.partition('?')[0]
        action = os.path.dirname(path_only).strip('/')
        _, file_extension = os.path.splitext(os.path.basename(path_only))

        if action == "convert_if_absent":
            rebuild = False
        elif action == "convert_mandatory":
            rebuild = True
        else:
            self.send_error_404(
                "bad action (file path), can be 'convert_mandatory' or 'convert_if_absent', got \"{}\""
                .format(action))
            return
        if file_extension not in ALLOWED_FILE_EXTENSTIONS:
            self.send_error_404("bad file extension: {}, can be {}".format(
                file_extension, ALLOWED_FILE_EXTENSTIONS))
            return

        # The header is client-controlled: it may be absent, non-numeric or
        # negative. A negative size would make rfile.read() read until EOF.
        try:
            file_length = int(self.headers['Content-Length'])
            if file_length < 0:
                raise ValueError("negative Content-Length")
        except (KeyError, TypeError, ValueError):
            self.send_error_404("bad or missing Content-Length header")
            return

        max_file_size = 2**25
        if file_length > max_file_size:
            self.send_error_404(
                "file is too large (size must less than {} bytes ".format(
                    max_file_size))
            return
        self.server.logger.debug("receive file {} length {}".format(
            self.path, file_length))
        try:
            file_bytes = self.rfile.read(file_length)
        except ConnectionError as exp:
            self.send_error_404("ConnectionError : {}".format(exp))
            return

        sha256 = build_dislosures_sha256_by_file_data(file_bytes,
                                                      file_extension)
        if not rebuild and self.server.convert_storage.has_converted_file(
                sha256):
            self.send_response(201, 'Already exists')
            self.end_headers()
            return
        if not self.server.save_new_file(
                sha256,
                file_bytes,
                file_extension,
                rebuild,
                only_winword_conversion=query_components.get(
                    "only_winword_conversion", False),
                only_ocr=query_components.get("only_ocr", False)):
            self.send_response(
                201, 'Already registered as a conversion task, wait ')
            self.end_headers()
            return

        self.server.all_put_files_count += 1

        self.send_response(201, 'Created')
        self.end_headers()
예제 #6
0
import os
import sys

if __name__ == '__main__':
    # Rename short-named "<name>.pdf" / "<name>.pdf.docx" pairs in the given
    # folder to "<sha256>.pdf" / "<sha256>.pdf.docx". A pdf without a docx
    # companion is deleted; a pair whose target name already exists is
    # deleted as well.
    folder = sys.argv[1]
    for file_name in os.listdir(folder):
        # Only short pdf names are processed.
        if not file_name.endswith(".pdf") or len(file_name) >= 20:
            continue

        pdf_file = os.path.join(folder, file_name)
        docx_file = os.path.join(folder, file_name + ".docx")

        if not os.path.exists(docx_file):
            sys.stderr.write("cannot find {}\n".format(docx_file))
            sys.stderr.write("delete {}\n".format(pdf_file))
            os.unlink(pdf_file)
            continue

        # NOTE(review): the helper is called with a path and a single
        # argument here and is not imported in the visible part of this
        # script -- confirm its signature and import.
        sha256hash = build_dislosures_sha256_by_file_data(pdf_file)
        new_pdf_file = os.path.join(folder, sha256hash + ".pdf")

        if os.path.exists(new_pdf_file):
            sys.stderr.write(
                "{} already exists, skip renaming\n".format(pdf_file))
            for duplicate in (pdf_file, docx_file):
                sys.stderr.write("delete {}\n".format(duplicate))
                os.unlink(duplicate)
            continue

        sys.stderr.write("rename {} to {}\n".format(pdf_file, new_pdf_file))
        os.rename(pdf_file, new_pdf_file)
        os.rename(docx_file, new_pdf_file + ".docx")
예제 #7
0
    html_extension = '.html'
    #html_extension = '.docx'
    html_keys = list()
    while key is not None:
        value = source_doc_db.saved_file_params[key].decode('utf8')
        if value.find(';{};'.format(html_extension)) != -1:
            html_keys.append(key)
        key = source_doc_db.saved_file_params.nextkey(key)

    logger.info("found {} html keys".format(len(html_keys)))

    for key in html_keys:
        value = source_doc_db.saved_file_params[key].decode('utf8')
        html_data, file_extension = source_doc_db.get_saved_file(key)
        assert html_extension == file_extension
        new_sha256 = build_dislosures_sha256_by_file_data(
            html_data, html_extension)
        logger.debug("{} -> {}".format(key, new_sha256))
        source_doc_db.saved_file_params[
            new_sha256] = source_doc_db.saved_file_params[key]
        del source_doc_db.saved_file_params[key]

        key_utf8 = key.decode('utf8')
        old_smart_parser_key = "{},{}".format(key_utf8, 0.1)
        new_smart_parser_key = "{},{}".format(new_sha256, 0.1)
        if old_smart_parser_key in smart_parser_dbm:
            logger.debug("{} -> {}".format(old_smart_parser_key,
                                           new_smart_parser_key))
            smart_parser_dbm[new_smart_parser_key] = smart_parser_dbm[
                old_smart_parser_key]
            del smart_parser_dbm[old_smart_parser_key]
        else: