def extract_image(self, image):
    """Extract the sub-images of one page and save the processed page.

    The page is wrapped in an ``ExtractImage``. Its horizontal lines are
    drawn and merged, the page is cut with ``vertical_line`` along the
    merged lines, and the extracted pieces (if any) are combined with
    ``image_merge``. The processed page held by ``extract.image`` is always
    written below ``static/images/dealimg/``.

    Args:
        image: Page image handed unchanged to ``ExtractImage``.
            NOTE(review): presumably a grayscale PIL image - confirm
            against the callers.

    Returns:
        A one-element list ``[[deal_save_path, merge_save_path,
        extract_paths]]``. ``merge_save_path`` is ``""`` when nothing was
        extracted; ``extract_paths`` holds the second element of every
        entry returned by ``vertical_line``.
    """
    # Local import: only the unique file name below needs it.
    import uuid

    extract = ExtractImage(image)
    # Horizontal pass: draw the lines, then merge them.
    drawn_lines = extract.horizontal_draw_line(1, 2)
    split_lines = extract.horizontal_merger_line(drawn_lines)
    # Vertical pass along the merged horizontal lines.
    extracted = extract.vertical_line(1, 2, split_lines)

    # Each entry is indexed as [0] -> image object, [1] -> its saved path.
    extracted_images = []
    extracted_paths = []
    for entry in extracted:
        extracted_images.append(entry[0])
        extracted_paths.append(entry[1])

    # Merge the extracted pieces into one image when there are any.
    merge_save_path = ""
    if extracted_images:
        merge_save_path = extract.image_merge(extracted_images)

    # Save the intermediate (processed) page image.  time.ctime() only has
    # one-second resolution and contains spaces and colons, so pages handled
    # within the same second used to overwrite each other and the name was
    # not portable; a compact timestamp plus a random suffix avoids both.
    deal_save_path = "static/images/dealimg/%s_%s.jpg" % (
        time.strftime("%Y%m%d_%H%M%S"), uuid.uuid4().hex)
    extract.image.save("../%s" % deal_save_path)

    return [[deal_save_path, merge_save_path, extracted_paths]]
def extract_image(self):
    """Unzip every pending patent archive and extract its figures.

    Selects the ``urlno`` documents with ``kind == "patent"``,
    ``downflag == 1`` and ``extractflag == 0``. For each of them the archive
    at ``zippath`` is unzipped, its pages are processed from the last one
    backwards, the per-page merged images are combined into one image, and
    the ``urlno`` document is updated with ``extractflag = 1`` plus the
    ``dealpath``, ``extractpath`` and ``mergepath`` results.

    Patents whose archive yields no files are left untouched. A failing
    database update is reported and does not stop the remaining patents.

    Returns:
        None.
    """
    # Read the urlno collection to obtain the patent numbers and archives.
    results = self.db.urlno.find(
        {"kind": "patent", "downflag": 1, "extractflag": 0},
        {"_id": 0, "indexflag": 1, "zippath": 1})
    if not results:
        return

    patent_handler = PatentClass()
    # Drain the query into a list before the slow per-patent work starts.
    pending = [(doc["indexflag"], doc["zippath"]) for doc in results]

    for patentno, zip_path in pending:
        # Unzip the archive.
        list_unzip_save_path = patent_handler.unzip_patent(zip_path)
        print("解压完成")
        # Nothing was unpacked: skip this patent without touching its record.
        if not list_unzip_save_path:
            continue

        count_none = 0
        list_deal_save_path = []
        list_merge_save_path = []
        list_extract_save_path = []
        # Extract in reverse page order (last page first).
        for image_path in reversed(list_unzip_save_path):
            # Temporarily convert the TIFF page to JPEG; the temp file lives
            # in the working directory and is overwritten for every page.
            PythonMagick.Image("../%s" % image_path).write("temptiftojpg.jpg")
            image = Image.open("temptiftojpg.jpg").convert("L")

            # Result shape: [[deal_path, merge_path, extract_paths]].
            page_result = patent_handler.extract_image(image)[0]
            if page_result[1]:
                list_deal_save_path.append(page_result[0])
                list_merge_save_path.append(page_result[1])
                list_extract_save_path.append(page_result[2])
            else:
                # Stop at the second page (in total, not necessarily
                # consecutive) that produced no merged image.
                count_none += 1
                if count_none == 2:
                    break

        # Combine the per-page merged images into one large image.
        final_merge_save_path = ""
        if list_merge_save_path:
            list_merge_image = [
                Image.open("../%s" % merge_image_path)
                for merge_image_path in list_merge_save_path
            ]
            final_merge_save_path = ExtractImage().image_merge(
                list_merge_image, patentno)

        # Store the results and flag the patent as extracted.
        try:
            self.db.urlno.update(
                {"indexflag": patentno},
                {"$set": {"extractflag": 1,
                          "dealpath": list_deal_save_path,
                          "extractpath": list_extract_save_path,
                          "mergepath": final_merge_save_path}})
        except Exception as exc:
            # Still best effort, but no longer silent, and Ctrl-C /
            # SystemExit are no longer swallowed.
            print("urlno update failed for %r: %r" % (patentno, exc))
        else:
            print("提取成功")
    return
def extract_image(self):
    """Extract figures from the converted pages of up to ten pending URLs.

    Selects the ``urlno`` documents with ``convertflag == 1`` and
    ``extractflag == 0``. For each one, the page list is read from the
    ``convertpath`` field of its ``convertimg`` document (the last document
    wins when several match). Every page is opened, converted to grayscale
    and run through ``ExtractImage(...).main()``; the per-page merged images
    are then combined into one. Results are inserted into ``dealimg``,
    ``extractimg`` and ``mergeimg`` unless a document for that URL already
    exists, and the ``urlno`` document gets ``extractflag = 1``.

    A URL with no ``convertimg`` document is skipped and keeps
    ``extractflag == 0``. Processing stops after ten URLs have been handled.

    Returns:
        None.
    """
    # Read the urlno entries whose extraction flag is still 0.
    list_url = self.db.urlno.find(
        {"convertflag": 1, "extractflag": 0}, {"indexflag": 1})
    i = 0
    for urls in list_url:
        url = urls["indexflag"]

        # Look up the converted page images that belong to this URL.  Reset
        # per URL: otherwise a URL without a convertimg document raised
        # NameError (first iteration) or silently reused the previous URL's
        # pages and stored them under the wrong indexflag.
        list_convert_imgs = None
        for item in self.db.convertimg.find({"indexflag": url},
                                            {"convertpath": 1}):
            list_convert_imgs = item["convertpath"]
        if list_convert_imgs is None:
            continue

        # Run the extraction on every page.
        list_save_path = []
        for convert_img in list_convert_imgs:
            image = Image.open(convert_img).convert("L")
            list_save_path.append(ExtractImage(image).main())

        # Split each page result [[deal, merge, [extract, ...]]] by kind.
        list_deal_image_path = []
        list_extract_image_path = []
        list_merge_image_path = []
        for each_save_path in list_save_path:
            page_result = each_save_path[0]
            list_deal_image_path.append(page_result[0])
            if page_result[1]:
                list_merge_image_path.append(page_result[1])
            if page_result[2]:
                list_extract_image_path.extend(page_result[2])

        # Merge the per-page images into a single image.
        final_merge_image_path = ""
        if list_merge_image_path:
            list_merge_image = [
                Image.open(each_merge_image_path)
                for each_merge_image_path in list_merge_image_path
            ]
            final_merge_image_path = ExtractImage().image_merge(
                list_merge_image)

        # Write to dealimg / extractimg / mergeimg, skipping any collection
        # that already holds a document for this URL.
        for collection, field, value in (
                (self.db.dealimg, "dealpath", list_deal_image_path),
                (self.db.extractimg, "extractpath", list_extract_image_path),
                (self.db.mergeimg, "mergepath", final_merge_image_path)):
            if not collection.find_one({"indexflag": url}):
                collection.insert(
                    {"indexflag": url, field: value, "time": time.ctime()})

        self.db.urlno.update({"indexflag": url},
                             {"$set": {"extractflag": 1}})

        # Handle at most ten URLs per call.
        i = i + 1
        print(i)
        if i > 9:
            break
    return