def save_file(self, file_bytes, file_extension, aux_params=None, force=False, sha256=None):
    """Append file_bytes to the newest bin file and index it by sha256.

    Does nothing when the db is read-only, or when the hash is already
    registered (unless ``force``).  Computes the sha256 itself when the
    caller does not supply one.  Logs and re-raises on any write failure.
    """
    if self.read_only:
        self.logger.error(
            "cannot save file since the db is opened in read-only mode")
        return
    if sha256 is None:
        sha256 = build_dislosures_sha256_by_file_data(
            file_bytes, file_extension)
    # Deduplicate: a hash that is already indexed is not stored twice.
    if not force and self.saved_file_params.get(sha256) is not None:
        return

    bin_file = self.bin_files[-1]
    # Roll over to a fresh bin file once the current one grows past the cap.
    if bin_file.tell() > self.max_bin_file_size:
        self.create_new_bin_file()
        bin_file = self.bin_files[-1]
    bin_no = len(self.bin_files) - 1

    try:
        self.write_repeat_header_to_bin_file(file_bytes, file_extension,
                                             bin_file)
    except IOError as exp:
        self.logger.error(
            "cannot write repeat header for {} to {}, exception:{}".format(
                sha256, bin_file.name, exp))
        raise

    try:
        # Record the payload offset only after the repeat header went in.
        start_pos = bin_file.tell()
        bin_file.write(file_bytes)
        bin_file.flush()
    except IOError as exp:
        self.logger.error(
            "cannot write file {}{} (size {}) to {}, exception:{}".format(
                sha256, file_extension, len(file_bytes), bin_file.name, exp))
        raise

    try:
        info = TStoredFileInfo(
            file_position=start_pos,
            file_no=bin_no,
            file_size=len(file_bytes),
            file_extension=file_extension,
            aux_params=aux_params)
        self.write_key_to_dbm(sha256, info.write_to_string())
    except Exception as exp:
        self.logger.error(
            "cannot add file info {} to {}, exception:{}".format(
                sha256, self.dbm_path, exp))
        raise

    self.logger.debug("put {}{} (size={}) to bin file {}".format(
        sha256, file_extension, len(file_bytes), bin_no))
    self.update_stats(len(file_bytes))
def save_file(self, file_bytes, file_extension, aux_params=None, force=False, sha256=None):
    """Append file_bytes to the current bin file and record its location in the header.

    Skips the write when the sha256 is already registered (unless ``force``).
    Maintains ``self.output_bin_file_size`` as the authoritative write
    position and cross-checks it against the OS file offset.
    Logs and re-raises on any failure.
    """
    if sha256 is None:
        sha256 = build_dislosures_sha256_by_file_data(
            file_bytes, file_extension)
    # Deduplicate: an already-indexed hash is not stored twice.
    if not force and self.saved_file_params.get(sha256) is not None:
        return
    output_bin_file = self.bin_files[-1]
    # Roll over to a fresh bin file once the tracked size passes the cap;
    # stats are persisted at the rollover boundary.
    if self.output_bin_file_size > self.max_bin_file_size:
        self.create_new_bin_file()
        self.save_stats()
        output_bin_file = self.bin_files[-1]
    try:
        bytes_count = self.write_repeat_header_to_bin_file(
            file_bytes, file_extension, output_bin_file)
        self.output_bin_file_size += bytes_count
    except IOError as exp:
        self.logger.error(
            "cannot write repeat header for {} to {}, exception:{}".format(
                sha256, output_bin_file.name, exp))
        raise
    try:
        # The payload offset is taken from the tracked size, i.e. right
        # after the repeat header.
        start_file_pos = self.output_bin_file_size
        output_bin_file.write(file_bytes)
        output_bin_file.flush()
        self.output_bin_file_size += len(file_bytes)
        # Sanity check: tracked size must agree with the real file offset.
        assert output_bin_file.tell() == self.output_bin_file_size
    except IOError as exp:
        self.logger.error(
            "cannot write file {}{} (size {}) to {}, exception:{}".format(
                sha256, file_extension, len(file_bytes),
                output_bin_file.name, exp))
        raise
    try:
        # NOTE(review): aux_params is stringified here, so None becomes the
        # literal "None" — confirm readers of the header expect that.
        params = TStoredFileParams(bin_file_index=len(self.bin_files) - 1,
                                   file_offset_in_bin_file=start_file_pos,
                                   file_size=len(file_bytes),
                                   file_extension=file_extension,
                                   aux_params=str(aux_params))
        self.write_key_to_header(sha256, params.to_string())
    except Exception as exp:
        self.logger.error(
            "cannot add file info {} to {}, exception:{}".format(
                sha256, self.header_file_path, exp))
        raise
    self.logger.debug("put {}{} (size={}) to bin file {}".format(
        sha256, file_extension, len(file_bytes), len(self.bin_files) - 1))
    self.update_stats(len(file_bytes))
def put_to_task_queue(self, file_bytes, file_extension, rebuild=False):
    """Write file_bytes into the input task directory and enqueue its basename.

    The file is named by its sha256 plus extension.  Nothing is queued when
    the file already sits in the input directory, is already in the db
    (unless ``rebuild``), or has a rejected extension.
    """
    sha256 = build_dislosures_sha256_by_file_data(file_bytes, file_extension)
    task_path = os.path.join(self.args.input_task_directory,
                             sha256 + file_extension)
    # NOTE(review): exists-check then write is racy if several producers
    # share the directory — presumed single-writer here.
    if os.path.exists(task_path):
        self.logger.debug(
            "file {} already exists in the input queue".format(task_path))
        return
    key = self.build_key(sha256, None)
    if not rebuild and self.json_cache_dbm.get(key) is not None:
        self.logger.debug(
            "file {} already exists in the db".format(task_path))
        return
    if not self.check_file_extension(str(task_path)):
        self.logger.debug("bad file extension {}".format(task_path))
        return
    with open(task_path, "wb") as outp:
        outp.write(file_bytes)
    self.task_queue.put(os.path.basename(task_path))
    self.logger.debug("put {} to queue".format(task_path))
def check_storage(self, fail_fast):
    """Verify every stored document's contents against its sha256 key.

    Returns True when all documents hash back to their keys; with
    ``fail_fast`` the first mismatch aborts and returns False.
    """
    checked = 0
    errors_count = 0
    for checked, key in enumerate(self.get_all_keys(), start=1):
        # Progress heartbeat every 100 documents.
        if checked % 100 == 0:
            self.logger.debug("file N {}".format(checked))
        info = self.get_saved_file(key)
        digest = build_dislosures_sha256_by_file_data(
            info.file_contents, info.file_extension)
        if digest != key:
            errors_count += 1
            self.logger.error(
                "key {} has invalid data, length={}, file_extension={}, bin_file_index={}"
                .format(key, info.file_size, info.file_extension,
                        info.file_no))
            if fail_fast:
                self.logger.error("stop checking")
                return False
    self.logger.info("checked {} documents, errors_count={}".format(
        checked, errors_count))
    return errors_count == 0
def _do_PUT(self):
    """Handle an HTTP PUT: accept a document body and schedule its conversion.

    The URL directory part selects the action ('convert_if_absent' or
    'convert_mandatory'); the basename supplies the file extension; the
    request body is the raw document.  Responds 201 both on success and
    when the document is already converted or already queued; all other
    outcomes go through send_error_404.
    """
    if self.path is None:
        self.send_error_404("no file specified")
        return
    action = os.path.dirname(self.path)
    query_components = dict()
    if not self.parse_cgi(query_components):
        self.send_error_404('bad request')
        return
    _, file_extension = os.path.splitext(os.path.basename(self.path))
    # splitext keeps the query string glued to the extension; cut it off.
    if file_extension.find('?') != -1:
        file_extension = file_extension[:file_extension.find('?')]
    action = action.strip('/')
    if action == "convert_if_absent":
        rebuild = False
    elif action == "convert_mandatory":
        rebuild = True
    else:
        self.send_error_404(
            "bad action (file path), can be 'convert_mandatory' or 'convert_if_absent', got \"{}\""
            .format(action))
        return
    if file_extension not in ALLOWED_FILE_EXTENSTIONS:
        self.send_error_404("bad file extension: {}, can be {}".format(
            file_extension, ALLOWED_FILE_EXTENSTIONS))
        return
    file_length = int(self.headers['Content-Length'])
    max_file_size = 2 ** 25  # 32 MiB upload limit
    if file_length > max_file_size:
        # BUGFIX: message previously read "size must less than {} bytes "
        # (grammar error, unclosed parenthesis).
        self.send_error_404(
            "file is too large (size must be less than {} bytes)".format(
                max_file_size))
        return
    self.server.logger.debug("receive file {} length {}".format(
        self.path, file_length))
    try:
        file_bytes = self.rfile.read(file_length)
    except ConnectionError as exp:
        self.send_error_404("ConnectionError : {}".format(exp))
        return
    sha256 = build_dislosures_sha256_by_file_data(file_bytes,
                                                  file_extension)
    if not rebuild and self.server.convert_storage.has_converted_file(
            sha256):
        self.send_response(201, 'Already exists')
        self.end_headers()
        return
    if not self.server.save_new_file(
            sha256, file_bytes, file_extension, rebuild,
            only_winword_conversion=query_components.get(
                "only_winword_conversion", False),
            only_ocr=query_components.get("only_ocr", False)):
        self.send_response(201, 'Already registered as a conversion task, wait ')
        self.end_headers()
        return
    self.server.all_put_files_count += 1
    # BUGFIX: status message was the typo 'Cre ated'.
    self.send_response(201, 'Created')
    self.end_headers()
import os
import sys

# NOTE(review): build_dislosures_sha256_by_file_data is used below but never
# imported in this script — add the project import or the script fails with
# NameError at the first short .pdf name.

if __name__ == '__main__':
    # Rename short-named pdf/docx pairs in a folder to <sha256>.pdf(.docx),
    # deleting orphans and duplicates along the way.
    folder = sys.argv[1]
    for x in os.listdir(folder):
        # Only short (non-hash) pdf names still need renaming.
        if not (x.endswith(".pdf") and len(x) < 20):
            continue
        pdf_file = os.path.join(folder, x)
        docx_file = os.path.join(folder, x + ".docx")
        if not os.path.exists(docx_file):
            # Orphan pdf without its converted docx: drop it.
            sys.stderr.write("cannot find {}\n".format(docx_file))
            sys.stderr.write("delete {}\n".format(pdf_file))
            os.unlink(pdf_file)
            continue
        # BUGFIX: the hash helper takes (file_bytes, file_extension) at every
        # other call site in this project; the original passed the *path*
        # string, so the filename (not the document contents) was hashed.
        with open(pdf_file, "rb") as inp:
            pdf_data = inp.read()
        sha256hash = build_dislosures_sha256_by_file_data(pdf_data, ".pdf")
        new_pdf_file = os.path.join(folder, sha256hash + ".pdf")
        if os.path.exists(new_pdf_file):
            # A file with this content hash already exists: drop the pair.
            sys.stderr.write(
                "{} already exists, skip renaming\n".format(pdf_file))
            sys.stderr.write("delete {}\n".format(pdf_file))
            os.unlink(pdf_file)
            sys.stderr.write("delete {}\n".format(docx_file))
            os.unlink(docx_file)
            continue
        sys.stderr.write("rename {} to {}\n".format(pdf_file, new_pdf_file))
        os.rename(pdf_file, new_pdf_file)
        os.rename(docx_file, new_pdf_file + ".docx")
html_extension = '.html' #html_extension = '.docx' html_keys = list() while key is not None: value = source_doc_db.saved_file_params[key].decode('utf8') if value.find(';{};'.format(html_extension)) != -1: html_keys.append(key) key = source_doc_db.saved_file_params.nextkey(key) logger.info("found {} html keys".format(len(html_keys))) for key in html_keys: value = source_doc_db.saved_file_params[key].decode('utf8') html_data, file_extension = source_doc_db.get_saved_file(key) assert html_extension == file_extension new_sha256 = build_dislosures_sha256_by_file_data( html_data, html_extension) logger.debug("{} -> {}".format(key, new_sha256)) source_doc_db.saved_file_params[ new_sha256] = source_doc_db.saved_file_params[key] del source_doc_db.saved_file_params[key] key_utf8 = key.decode('utf8') old_smart_parser_key = "{},{}".format(key_utf8, 0.1) new_smart_parser_key = "{},{}".format(new_sha256, 0.1) if old_smart_parser_key in smart_parser_dbm: logger.debug("{} -> {}".format(old_smart_parser_key, new_smart_parser_key)) smart_parser_dbm[new_smart_parser_key] = smart_parser_dbm[ old_smart_parser_key] del smart_parser_dbm[old_smart_parser_key] else: