def extract_image(self, max_urls=10):
    """Run image extraction for URLs that are converted but not yet extracted.

    Selects ``urlno`` documents with ``convertflag == 1`` and
    ``extractflag == 0``. For each one, every page image listed in the
    matching ``convertimg`` document is opened, converted to greyscale
    and passed through ``ExtractImage(...).main()``. The resulting paths
    are written to the ``dealimg``, ``extractimg`` and ``mergeimg``
    collections (only when no document for that URL exists there yet),
    and the URL's ``extractflag`` is then set to 1.

    URLs without any ``convertimg`` document are skipped and keep
    ``extractflag == 0``, so a later call picks them up again. They do
    not count towards ``max_urls``.

    Args:
        max_urls: Maximum number of URLs to process in one call.
            Defaults to 10, the limit that was previously hard-coded.
            Values below 1 make the call a no-op.

    Returns:
        None. Results are stored in the database; the running count of
        processed URLs is printed after each one.
    """
    if max_urls <= 0:
        return

    # URLs whose PDF pages were converted but not yet run through extraction.
    pending_urls = self.db.urlno.find(
        {"convertflag": 1, "extractflag": 0}, {"indexflag": 1})

    processed = 0
    for url_doc in pending_urls:
        url = url_doc["indexflag"]

        # Converted page images for this URL. Reset for every URL so that a
        # missing convertimg record can never reuse the previous URL's pages
        # (or raise NameError on the first URL). If several records match,
        # the last one wins.
        page_image_paths = None
        convert_docs = self.db.convertimg.find(
            {"indexflag": url}, {"convertpath": 1})
        for convert_doc in convert_docs:
            page_image_paths = convert_doc["convertpath"]
        if page_image_paths is None:
            # Nothing to extract from; leave extractflag at 0 for a retry.
            continue

        # Run the extractor on each page and sort the paths it reports.
        # NOTE(review): main() is indexed as result[0] ->
        # (deal path, merge path, extracted image paths); confirm this
        # against ExtractImage.
        deal_image_paths = []
        extract_image_paths = []
        merge_image_paths = []
        for page_image_path in page_image_paths:
            page_image = Image.open(page_image_path).convert("L")
            page_paths = ExtractImage(page_image).main()[0]
            deal_image_paths.append(page_paths[0])
            if page_paths[1]:
                merge_image_paths.append(page_paths[1])
            extract_image_paths.extend(page_paths[2])

        # Merge the per-page merge images into one image for the document.
        final_merge_image_path = ""
        if merge_image_paths:
            merge_images = [Image.open(path) for path in merge_image_paths]
            final_merge_image_path = ExtractImage().image_merge(merge_images)

        # Write the results to dealimg, extractimg and mergeimg, never
        # overwriting a document that already exists for this URL.
        # NOTE(review): insert()/update() are the legacy PyMongo collection
        # methods (deprecated in 3.0, removed in 4.0); switch to
        # insert_one()/update_one() once the driver version is confirmed.
        results = (
            (self.db.dealimg, "dealpath", deal_image_paths),
            (self.db.extractimg, "extractpath", extract_image_paths),
            (self.db.mergeimg, "mergepath", final_merge_image_path),
        )
        for collection, field, value in results:
            if not collection.find_one({"indexflag": url}):
                collection.insert(
                    {"indexflag": url, field: value, "time": time.ctime()})

        self.db.urlno.update({"indexflag": url}, {"$set": {"extractflag": 1}})

        processed += 1
        print(processed)
        if processed >= max_urls:
            break