def un_pack(compressed_files_path):
    """Extract a ``.rar`` or ``.zip`` archive into a sibling directory.

    The archive is unpacked into ``<archive directory>/<file name without
    extension>``. A directory already present at that location is removed
    first. The archive itself is deleted only after a successful extraction.

    Args:
        compressed_files_path: Path of the archive to extract. The extension
            is matched case-insensitively.

    Returns:
        The extraction directory, or ``None`` when the extension is not
        supported, ``unrar`` reports a failure, or any exception is raised
        (exceptions are logged, never propagated).
    """
    # Imported locally so the block does not rely on a module-level import.
    import subprocess

    try:
        parent_dir, file_name = os.path.split(compressed_files_path)
        # splitext keeps dot-files such as ".zip" intact (empty extension), so
        # the target can never collapse to the parent directory itself.
        name, extension = os.path.splitext(file_name)
        suffix = extension[1:].lower()
        if suffix not in ("rar", "zip"):
            # Leave unsupported files untouched instead of deleting them.
            logger.info(f"unsupported archive type: {compressed_files_path}")
            return None

        # Build the extraction path and start from an empty directory.
        uncompress_dir = os.path.join(parent_dir, name)
        if os.path.exists(uncompress_dir):
            shutil.rmtree(uncompress_dir)
        os.makedirs(uncompress_dir)

        # Extract according to the archive type.
        if suffix == "rar":
            # An argument list (no shell) keeps spaces and shell
            # metacharacters in the paths from altering the command.
            completed = subprocess.run(
                ["unrar", "x", "-o-", "-y",
                 compressed_files_path, uncompress_dir],
                check=False,
            )
            if completed.returncode != 0:
                # Keep the archive: it has not been extracted cleanly.
                logger.info(
                    f"unrar exited with status {completed.returncode}: "
                    f"{compressed_files_path}")
                return None
        else:
            shutil.unpack_archive(compressed_files_path, uncompress_dir, "zip")

        os.remove(compressed_files_path)
        return uncompress_dir
    except Exception as e:
        logger.exception(e)
        return None
def del_file(dir_path):
    """Delete the hidden files (names starting with ".") inside *dir_path*.

    Only entries directly inside the directory are examined. Hidden
    sub-directories are left alone because ``os.remove`` cannot delete them.
    Errors are logged and never propagated.

    Args:
        dir_path: Directory to clean.
    """
    try:
        for entry in os.listdir(dir_path):
            if not entry.startswith("."):
                continue
            # os.listdir yields bare names; resolve them against dir_path so
            # the right file is removed regardless of the working directory.
            target = os.path.join(dir_path, entry)
            if os.path.isdir(target) and not os.path.islink(target):
                continue
            os.remove(target)
    except Exception as e:
        logger.exception(e)
def _word_to_pdf_batch(dir_path, transform=None, delete=True):
    """Convert the Word files directly inside *dir_path* to PDF via ``soffice``.

    For each extension, all matching files are handed to a single headless
    ``soffice`` run that writes its output into *dir_path*. Errors are logged
    and never propagated.

    Args:
        dir_path: Directory holding the source files; also the output
            directory.
        transform: Extensions (without the dot) to convert. Defaults to
            ``["doc", "docx"]``.
        delete: When true, the source files of an extension are removed after
            ``soffice`` exits with status 0 for that extension.
    """
    # Imported locally so the block does not rely on module-level imports.
    import glob
    import subprocess

    if transform is None:
        transform = ["doc", "docx"]
    try:
        if not (dir_path and os.path.exists(dir_path) and transform):
            return
        for file_type in transform:
            # Expand the wildcard in Python rather than in a shell so that
            # spaces or metacharacters in paths cannot alter the command.
            pattern = os.path.join(glob.escape(dir_path), f"*.{file_type}")
            sources = sorted(glob.glob(pattern))
            if not sources:
                continue
            completed = subprocess.run(
                ["soffice", "--headless", "-convert-to", "pdf",
                 *sources, "--outdir", dir_path],
                check=False,
            )
            if completed.returncode != 0 or not delete:
                continue
            for source in sources:
                try:
                    os.remove(source)
                except FileNotFoundError:
                    # Same tolerance as ``rm -f``: already gone is fine.
                    continue
            logger.info(f"rm -f {dir_path}/*.{file_type}")
    except Exception as e:
        logger.exception(e)
def recursion_decompressing(self, files_path):
    """Extract the nested archives found directly inside *files_path*.

    Every non-hidden entry whose extension is ``.rar`` or ``.zip`` is passed,
    as a path joined onto *files_path*, to ``self.uncompress``. Errors are
    logged and never propagated.

    Args:
        files_path: Directory holding the files produced by the previous
            extraction step.
    """
    try:
        if not files_path or not os.path.exists(files_path):
            return
        for file in os.listdir(files_path):
            if file.startswith("."):
                continue
            # splitext returns "" for names without a dot, so a file that is
            # literally called "rar" or "zip" is not mistaken for an archive.
            if os.path.splitext(file)[1] in (".rar", ".zip"):
                self.uncompress(os.path.join(files_path, file))
    except Exception as e:
        logger.exception(e)
def get_message(self, key, func):
    """Poll the cache list *key* and hand each message to *func* on a new thread.

    Messages are popped with ``cache.rpop``. Each one is evaluated into a
    Python object and passed as the single argument of *func*, which runs on
    its own ``Thread``. When no message is available the loop sleeps one
    second; after ``self.timeout`` consecutive empty polls it stops by raising
    ``SystemExit``. Other exceptions are logged and end the loop.

    Args:
        key: Name of the cache list to consume.
        func: Callable invoked with the decoded message.
    """
    try:
        idle_polls = 0
        while True:
            message = cache.rpop(key)
            if message:
                logger.info(f"{key}, message{message}")
                # SECURITY NOTE(review): eval() executes whatever expression
                # is stored in the cache. Anyone able to write to this list
                # can run arbitrary code here; a safe deserializer
                # (e.g. json.loads / ast.literal_eval) should replace it once
                # the producer's message format is confirmed.
                merge_process = Thread(target=func, args=(eval(message), ))
                merge_process.start()
                idle_polls = 0
                continue
            if idle_polls >= self.timeout:
                # Raise SystemExit directly: the exit() helper comes from the
                # site module, is meant for interactive use, and closes stdin.
                raise SystemExit
            idle_polls += 1
            time.sleep(1)
    except Exception as e:
        logger.exception(e)
def check_libre_office_status():
    """Check that LibreOffice is available and try to install it when it is not.

    Runs ``libreoffice --version``. A non-zero status is treated as "not
    installed", in which case ``yum install -y libreoffice`` is attempted.
    Every outcome is only logged; nothing is returned or raised.
    """
    success_statuses = (0, "0")
    try:
        version_status = os.system("libreoffice --version")
        logger.info(f"{version_status}")

        if version_status in success_statuses:
            logger.info("系统已安装")
            return

        logger.info("系统内部没有安装")
        logger.info("start install libreoffice")
        install_status = os.system("yum install -y libreoffice")
        if install_status in success_statuses:
            logger.info("安装成功")
        else:
            logger.info("安装失败")
    except Exception as e:
        logger.exception(e)
def start(self):
    """Seed the first queue, launch the three consumer threads and wait for the merge.

    The start item is pushed onto ``self.message_a``; then one
    ``get_message`` consumer is started per queue: download
    (``message_a``), extraction (``message_b``) and result merging
    (``message_c``). Only the merge thread is joined.

    Returns:
        ``self.result`` once the merge thread has finished, or ``None`` when
        an exception was raised (it is logged, not propagated).
    """

    def make_consumer(queue_key, handler):
        # Every stage runs the same polling loop with its own queue/handler.
        return Thread(target=self.get_message, args=(queue_key, handler))

    try:
        self.push_item_in_redis_list(self.message_a, self.start_item)

        download_worker = make_consumer(self.message_a, self.download_page)
        extract_worker = make_consumer(self.message_b, self.primary)
        merge_worker = make_consumer(self.message_c, self.merger_result)

        download_worker.start()
        logger.info("下载线程开启!")
        # Pause before starting the downstream consumers.
        time.sleep(3)

        extract_worker.start()
        logger.info("抽取线程开启!")

        merge_worker.start()
        logger.info("数据合并线程开启")

        merge_worker.join()
        return self.result
    except Exception as error:
        logger.exception(error)
def word_to_pdf_single(word_path: str, dir_path, delete=True):
    """Convert one doc/docx file to PDF with headless ``soffice``.

    The source file is removed afterwards when ``soffice`` exits with status 0
    and *delete* is true. Nothing happens when *word_path* is empty or does
    not exist. Errors are logged and never propagated.

    Args:
        word_path: Path of the doc/docx file.
        dir_path: Directory where the converted file is saved.
        delete: Whether to delete the source after a successful conversion;
            defaults to deleting.
    """
    # Imported locally so the block does not rely on a module-level import.
    import subprocess

    try:
        if not word_path or not os.path.exists(word_path):
            return
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters from breaking or altering the command.
        completed = subprocess.run(
            ["soffice", "--headless", "-convert-to", "pdf",
             word_path, "--outdir", dir_path],
            check=False,
        )
        if completed.returncode == 0 and delete:
            os.remove(word_path)
            logger.info(f"rm {word_path}")
    except Exception as e:
        logger.exception(e)
def word_to_pdf_batch(self, dir_path):
    """Convert every Word file under *dir_path* to PDF.

    The tree is walked recursively. Each non-hidden ``.doc`` / ``.docx`` file
    is passed to ``self.word_to_pdf_single`` with *dir_path* itself as the
    output directory, so files from sub-directories are also converted into
    *dir_path*. The processed paths are logged once at the end. Per the
    original note, mainly the images embedded in the Word files are used;
    there is no requirement on the text.

    Args:
        dir_path: Directory to process. Nothing happens when it is empty.
    """
    if not dir_path:
        return
    try:
        all_word_files = []
        for root, _dirs, files in os.walk(dir_path):
            for file_name in files:
                if file_name.startswith("."):
                    continue
                # splitext returns "" for names without a dot, so a file
                # literally called "doc" is not treated as a Word file.
                if os.path.splitext(file_name)[1] not in (".doc", ".docx"):
                    continue
                word_path = os.path.join(root, file_name)
                self.word_to_pdf_single(word_path, dir_path)
                # Record the path so the summary log is no longer always [].
                all_word_files.append(word_path)
        logger.info(all_word_files)
    except Exception as e:
        logger.exception(e)