def get_info(self, xmlfile): #返回字典 try: xml = parse_xml(xmlfile, show_log=False) #判断文件 xml.xml_exists() xml.get_root() all = {} #dirc={} childs = xml.get_element_children(xml.root) for child in childs: plat = child.get("platform") cid = child.get("id") ip = child.get("ip") sshport = int(child.get("port")) mark = plat + "_s" + cid dirc = {mark: {}} dirc[mark].update({ "ip": ip, "id": cid, "platform": plat, "port": sshport }) all.update(dirc) if self.show_log: for agent in all: print agent return all except Exception as err: print "获取失败.错误如下:", err
def read_label_from_xml(label_path):
    """Read labels from an xml file.

    # Arguments:
        label_path: path to the label xml file of one sequence.

    # Returns:
        label_dic (dict): frame index -> {"trans", "rot", "size"}; when
            several objects share a frame the arrays are np.vstack-ed.
        size (list): bounding box size [l, w, h] of the LAST label read
            (None when the file contains no labels).
    """
    labels = pt.parse_xml(label_path)
    label_dic = {}
    # Fix: previously `size` was a NameError when `labels` was empty.
    size = None
    for label in labels:
        first_frame = label.first_frame
        num_frames = label.num_frames
        size = label.size
        obj_type = label.object_type
        # Walk the object's frames in lockstep with its per-frame
        # translations and rotations.
        for index, place, rot in zip(
                range(first_frame, first_frame + num_frames),
                label.trans, label.rots):
            if index in label_dic:
                label_dic[index]["trans"] = np.vstack(
                    (label_dic[index]["trans"], place))
                label_dic[index]["size"] = np.vstack(
                    (label_dic[index]["size"], np.array(size)))
                label_dic[index]["rot"] = np.vstack(
                    (label_dic[index]["rot"], rot))
            else:
                label_dic[index] = {
                    "trans": place,
                    "rot": rot,
                    "size": np.array(size),
                }
    return label_dic, size
def get_info(self,xmlfile): #返回字典 try: xml=parse_xml(xmlfile,show_log=False) #判断文件 xml.xml_exists() xml.get_root() all={} #dirc={} childs=xml.get_element_children(xml.root) for child in childs: plat=child.get("platform") cid=child.get("id") ip=child.get("ip") sshport=int(child.get("port")) mark=plat+"_s"+cid dirc={mark:{}} dirc[mark].update({"ip":ip,"id":cid,"platform":plat,"port":sshport}) all.update(dirc) if self.show_log: for agent in all: print agent return all except Exception as err: print "获取失败.错误如下:",err
def get_repeat_ip(self, xmlfile): try: xml = parse_xml(xmlfile, show_log=False) #判断文件 xml.xml_exists() if self.writefile: name = xmlfile.split(".")[0] namefile = "%s_ip.txt" % name if os.path.exists(namefile): os.remove(namefile) file = open(namefile, 'w+') xml.get_root() childs = xml.get_element_children(xml.root) ips = [] for child in childs: ips.append(child.get("ip")) ips = sorted(list(set(ips))) if self.show_log: for ip in ips: print ip if self.writefile: file.write(ip) file.write("\n") if self.writefile: file.close() return ips except Exception as err: print err
def get_repeat_ip(self, xmlfile):
    """Return sorted unique agent IPs from *xmlfile*.

    Optionally prints them (self.show_log) and writes them to
    "<name>_ip.txt" (self.writefile).  Returns None after printing the
    error if anything fails.
    """
    try:
        xml = parse_xml(xmlfile, show_log=False)
        # check the file exists / parses before doing anything else
        xml.xml_exists()
        if self.writefile:
            name = xmlfile.split(".")[0]
            namefile = "%s_ip.txt" % name
            # start from a fresh output file each run
            if os.path.exists(namefile):
                os.remove(namefile)
            file = open(namefile, 'w+')
        xml.get_root()
        childs = xml.get_element_children(xml.root)
        ips = []
        for child in childs:
            ips.append(child.get("ip"))
        ips = sorted(list(set(ips)))
        if self.show_log:
            for ip in ips:
                print ip
                # NOTE(review): writing only happens while show_log is
                # on — likely unintended coupling; confirm with callers.
                if self.writefile:
                    file.write(ip)
                    file.write("\n")
        if self.writefile:
            # NOTE(review): handle leaks if an exception fires above.
            file.close()
        return ips
    except Exception as err:
        print err
def main():
    """Interactively enhance organization names in an ISO 19115 XML doc.

    Prompts for a file, extracts organization names, optionally validates
    each against VIAF, and writes the enhanced document to enhanced.xml.
    """
    # List organization names found in document
    orgs = []
    filename = input('Enter the name of a file you wish to enhance: ')
    namespaces = {'gmi': "http://www.isotc211.org/2005/gmi",
                  'gmd': "http://www.isotc211.org/2005/gmd",
                  'gco': "http://www.isotc211.org/2005/gco",
                  'gml': "http://www.opengis.net/gml/3.2",
                  'gmx': "http://www.isotc211.org/2005/gmx",
                  'gsr': "http://www.isotc211.org/2005/gsr",
                  'gss': "http://www.isotc211.org/2005/gss",
                  'gts': "http://www.isotc211.org/2005/gts",
                  'xlink': "http://www.w3.org/1999/xlink",
                  'xsi': "http://www.w3.org/2001/XMLSchema-instance",
                  'saxon': "http://saxon.sf.net/",
                  'srv': "http://www.isotc211.org/2005/srv",
                  'schemaLocation': "http://www.isotc211.org/2005/gmi "
                                    "http://www.ngdc.noaa.gov/metadata/"
                                    "published/xsd/schema.xsd"}
    #tree = ET.parse(urlopen(filename))
    tree = ET.parse(filename)
    root = tree.getroot()
    # Register prefixes so serialization keeps the canonical namespaces.
    for prefix in namespaces:
        ET.register_namespace(prefix, namespaces[prefix])
    # Read document and populate orgs list with raw organization names found
    if parse_xml(root, orgs):
        if len(orgs) == 0:
            print('No organization names could be found in the document. '
                  'Exiting...')
            exit()
        option = input(choice)
        if option == 'I':
            for each in orgs:
                print(each.name)
                next_thing = input(
                    '(E)nhance organization or press any other key to '
                    'continue iterating through organizations found.\n')
                if next_thing == 'E':
                    each.validate_in_viaf()
                    print(each.enhancement_info())
                # Any other answer just moves on to the next organization
                # (the original redundant `continue` was removed).
        enhanced_orgs = []
        enhance_doc = input('Enhance document? If no, program will exit [Y/N]: ')
        if enhance_doc == 'N':
            exit()
        if enhance_doc == 'Y':
            for each in orgs:
                if each.validated:
                    enhanced_orgs.append(each)
            if len(enhanced_orgs) == 0:
                # Fix: this message was a broken (paste-damaged) literal.
                print('No enhancements could be made. Exiting.')
                exit()
            # Enhance xml document
            enhance_xml(root, enhanced_orgs)
            rough_string = ET.tostring(root, 'utf-8')
            parsed = xml.dom.minidom.parseString(rough_string)
            #print(parsed.toprettyxml('\t'))
            # `with` guarantees the output handle is closed.
            with open('enhanced.xml', 'wb+') as f:
                f.write(parsed.toxml('utf-8'))
            print('Enhanced document created. See enhanced.xml')
    else:
        print('Problem reading document')
def snip_illustrations(zp, filename, id, vol, page): try: img_color = cv2.imread(extract_jp2(zp, filename)) delete_jp2(filename) identifier, title, author, pubplace, publisher, guesseddate = parse_xml(id) if not os.path.exists(os.path.join(cover_path, guesseddate)): os.mkdir(os.path.join(cover_path, guesseddate)) cv2.imwrite(os.path.join(cover_path, guesseddate, "{0}_{1}_{2}_{3}.jpg".format(id, vol, make_safe(title), guesseddate)), img_color, [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93] ) print("{0}_{1}_{2}_{3}.jpg".format(id, vol, make_safe(title), guesseddate)) except KeyError, e: r.rpoplpush(workerid, "coverproblems")
def read_objects(tracklet_file, num_frames): objects = [] # grouped by frames for n in range(num_frames): objects.append([]) # read tracklets from file tracklets = parse_xml(tracklet_file) num = len(tracklets) for n in range(num): tracklet = tracklets[n] # this part is inspired by kitti object development kit matlab code: computeBox3D h, w, l = tracklet.size trackletBox = np.array([ # in velodyne coordinates around zero point and without orientation yet\ [-l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2], [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2], [0.0, 0.0, 0.0, 0.0, h, h, h, h]]) # loop over all data in tracklet t = tracklet.firstFrame for translation, rotation, state, occlusion, truncation, amtOcclusion, amtBorders, absoluteFrameNumber in tracklet: # determine if object is in the image; otherwise continue if truncation not in (TRUNC_IN_IMAGE, TRUNC_TRUNCATED): continue # re-create 3D bounding box in velodyne coordinate system yaw = rotation[2] # other rotations are 0 in all xml files I checked assert np.abs(rotation[:2]).sum() == 0, 'object rotations other than yaw given!' rotMat = np.array([ [np.cos(yaw), -np.sin(yaw), 0.0], [np.sin(yaw), np.cos(yaw), 0.0], [0.0, 0.0, 1.0] ]) cornerPosInVelo = np.dot(rotMat, trackletBox) + np.tile(translation, (8, 1)).T # calc yaw as seen from the camera (i.e. 0 degree = facing away from cam), as opposed to # car-centered yaw (i.e. 0 degree = same orientation as car). # makes quite a difference for objects in periphery! # Result is in [0, 2pi] x, y, z = translation yawVisual = (yaw - np.arctan2(y, x)) % (2 * math.pi) o = type('', (), {})() o.box = cornerPosInVelo.transpose() o.type = tracklet.objectType o.tracklet_id = n objects[t].append(o) t = t + 1 return objects
def gather_signals(filepath):
    """Run every CV signal measure on *filepath* and bundle the results."""
    print("Gathering signals for '{0}'".format(filepath))
    filename = filepath.split("/")[-1]
    id, vol, page, imgno = breakdown_imagename(filename.split("/")[-1])
    metadata = parse_xml(id)
    cv_signals = []
    for cv_measure in CV_SIGNALS:
        result = cv_measure(filepath)
        # a measure may produce either a single value or a tuple of values
        cv_signals.extend(result if isinstance(result, tuple) else (result,))
    return {
        'metadata': metadata,
        'identifier': (id, vol, page, imgno),
        'cv_signals': cv_signals,
    }
def convert_component(path: Union[str, PathLike], mc=False) -> Path:
    """Convert a component XML file into a .dot graph file next to it.

    Args:
        path: path to an existing .xml component file.
        mc: forwarded to modal_to_dot.

    Returns:
        Path of the written .dot file (same stem as *path*).

    Raises:
        FileNotFoundError: *path* does not exist.
        ValueError: *path* is not an .xml file.
    """
    root_file = Path(path)
    if not root_file.exists():
        # Fix: grammar of the error message ("does not exists").
        raise FileNotFoundError(f'File does not exist: {root_file}')
    elif root_file.suffix != '.xml':
        raise ValueError(f'Expected xml file, got {root_file.suffix}')
    rename_to = root_file.with_suffix('.dot')
    # An existing .dot file is intentionally overwritten; the dead
    # `if rename_to.exists(): pass  #return rename_to` branch was removed.
    graph_repr = parse_xml(root_file)
    modal_to_dot(str(rename_to.absolute()), graph_repr, mc=mc)
    return rename_to
def get_all_info(self, xmlfile):
    """Return [[id, ip, platform], ...] for every agent element in *xmlfile*."""
    xml = parse_xml(xmlfile, show_log=False)
    # Validate the file before reading it.
    xml.xml_exists()
    xml.get_root()
    # Comprehension replaces the append chain; also stops shadowing the
    # builtins `all` and the ambiguous name `l`.
    return [[child.get("id"), child.get("ip"), child.get("platform")]
            for child in xml.get_element_children(xml.root)]
def get_all_info(self, xmlfile):
    """Collect an [id, ip, platform] triple for each agent node."""
    xml = parse_xml(xmlfile, show_log=False)
    # check the file first
    xml.xml_exists()
    xml.get_root()
    rows = []
    for node in xml.get_element_children(xml.root):
        rows.append([node.get("id"), node.get("ip"), node.get("platform")])
    return rows
def convert(args):
    """CLI entry point: convert a modal transition system between .dot and .xml."""
    options = {
        'input': '',
        'output': '',
        'mc': False,
        'auto_group': False
    }
    options.update(vars(args))
    ipath = Path(options['input'])
    opath = Path(options['output'])
    pprint(options)
    # reject unreadable input early
    if not ipath.exists():
        log.error(f'Given path does not exists, can not read: {ipath}')
        exit(-1)
    if ipath.suffix not in ['.dot', '.xml']:
        log.error(f'Unknown suffix: {ipath.suffix}')
        exit(-1)
    if opath.suffix not in ['.dot', '.xml']:
        log.error(f'Unknown suffix: {opath.suffix}')
        exit(-1)
    # parse according to the input format
    if ipath.suffix == '.dot':
        log.debug(f'Read input (dot): {ipath}')
        mts_repr = parse_dot(ipath)
    else:
        log.debug(f'Read input (xml): {ipath}')
        mts_repr = parse_xml(ipath)
    # serialize according to the output format
    writers = {'.dot': modal_to_dot, '.xml': modal_to_xml}
    writer = writers.get(opath.suffix)
    if writer is not None:
        log.info(f'Writing to {opath} ...')
        writer(path=opath,
               mts=mts_repr,
               mc=options['mc'],
               derive_groups=options['auto_group'])
def snip_illustrations(zp, filename, id, vol, page): try: img_color = cv2.imread(extract_jp2(zp, filename)) delete_jp2(filename) identifier, title, author, pubplace, publisher, guesseddate = parse_xml( id) if not os.path.exists(os.path.join(cover_path, guesseddate)): os.mkdir(os.path.join(cover_path, guesseddate)) cv2.imwrite( os.path.join( cover_path, guesseddate, "{0}_{1}_{2}_{3}.jpg".format(id, vol, make_safe(title), guesseddate)), img_color, [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93]) print("{0}_{1}_{2}_{3}.jpg".format(id, vol, make_safe(title), guesseddate)) except KeyError, e: r.rpoplpush(workerid, "coverproblems")
def get_md(item):
    """Build Flickr upload metadata for one queued image job.

    NOTE(review): Python 2 code — relies on `map` returning a list,
    str/unicode mixing and `except Exception, e`.  The function mutates
    `decoded` and `uploaded_title` but has no visible return; confirm the
    remainder of the function was not truncated.
    """
    job, flickr_id = item.strip().split("\t")
    img = job.strip()[24:]
    location = os.path.join(embellishments_path, img)
    id, vol, page, _ = img.split("/", 1)[1].split("_", 3)
    metadata = parse_xml(id)
    identifier, title, author, pubplace, publisher, guesseddate = metadata
    # utf-8 encode every field for the upload payload
    decoded = map(lambda x: x.encode("utf-8"), [
        title, author, guesseddate, pubplace, publisher, vol, str(int(page)),
        identifier
    ])
    uploaded_title = u"Image taken from page {1} of '{0}'".format(
        decoded[0].decode("utf-8"), page.decode("utf-8"))
    try:
        # prefer the zero-stripped page number when it parses as an int
        uploaded_title = u"Image taken from page {1} of '{0}'".format(
            decoded[0].decode("utf-8"), str(int(page)).decode("utf-8"))
    except:
        pass
    if os.path.exists(location):
        try:
            rosetta = r.get("s:" + id)
            if rosetta:
                d, a, u = rosetta.strip().split("\t")
                # itemViewer wants the ark number one below the stored one
                adjusted_ark_number = "{0:012x}".format(
                    int(a.split("_")[1], 16) - 1)
                adj_ark = "{0}_{1}".format(
                    a.split("_")[0], adjusted_ark_number.upper())
                hexpage = "0x000001"
                try:
                    hexpage = "{0:06x}".format(int(page)).upper()
                except:
                    pass
                additional = """ <ul><li>Open the page in the <a href="http://itemviewer.bl.uk/?itemid={2}#{1}.0x{0}">British Library's itemViewer (page: {3})</a></li> <li><a href="http://access.dl.bl.uk/{2}">Download the PDF for this book</a> """.format(hexpage, adj_ark, u, page)
                decoded += [additional]
            else:
                decoded += [u""]
        except Exception, e:
            print e
def get_md(item):
    """Assemble upload metadata (title, description fields) for an image job.

    NOTE(review): Python 2 semantics throughout (`map` -> list,
    `except Exception, e`).  No return statement is visible — verify the
    function body is complete.
    """
    job, flickr_id = item.strip().split("\t")
    img = job.strip()[24:]
    location = os.path.join(embellishments_path, img)
    # image name encodes identifier, volume and page
    id, vol, page, _ = img.split("/", 1)[1].split("_", 3)
    metadata = parse_xml(id)
    identifier, title, author, pubplace, publisher, guesseddate = metadata
    decoded = map(lambda x: x.encode("utf-8"),
                  [title, author, guesseddate, pubplace, publisher, vol,
                   str(int(page)), identifier])
    uploaded_title = u"Image taken from page {1} of '{0}'".format(
        decoded[0].decode("utf-8"), page.decode("utf-8"))
    try:
        # use the int-normalized page number when possible
        uploaded_title = u"Image taken from page {1} of '{0}'".format(
            decoded[0].decode("utf-8"), str(int(page)).decode("utf-8"))
    except:
        pass
    if os.path.exists(location):
        try:
            rosetta = r.get("s:" + id)
            if rosetta:
                d, a, u = rosetta.strip().split("\t")
                # ark number used by the viewer is the stored one minus 1
                adjusted_ark_number = "{0:012x}".format(
                    int(a.split("_")[1], 16) - 1)
                adj_ark = "{0}_{1}".format(a.split("_")[0],
                                           adjusted_ark_number.upper())
                hexpage = "0x000001"
                try:
                    hexpage = "{0:06x}".format(int(page)).upper()
                except:
                    pass
                additional = """ <ul><li>Open the page in the <a href="http://itemviewer.bl.uk/?itemid={2}#{1}.0x{0}">British Library's itemViewer (page: {3})</a></li> <li><a href="http://access.dl.bl.uk/{2}">Download the PDF for this book</a> """.format(hexpage, adj_ark, u, page)
                decoded += [additional]
            else:
                decoded += [u""]
        except Exception, e:
            print e
def snip_illustrations(zp, filename, altoxmloldpath, scale=1.1,
                       threshold=1200 * 600):
    """Second-pass snipper: extract SMALL illustrations (area < threshold).

    Reads illustration coordinates from the ALTO xml, lazily decodes the
    jp2 page image only when a snippet actually needs writing, crops each
    qualifying region and pushes the result onto "secondpassembellish".
    """
    id, vol, page = altopath_to_idvol(altoxmloldpath)
    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    doc = []
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)
    # only proceed if at least one illustration is below the area threshold
    if images and [x for x in images if x[2] * x[3] < threshold]:
        counter = 0
        skippable = True
        for rect in images:
            if rect[2] * rect[3] < threshold:
                skippable = False
        if skippable:
            print("No suitable images found within '{0}' - skipping".format(
                filename))
            return
        else:
            print("Page contains a relevant illustration, not skipping.")
        # sentinel: jp2 decode is deferred until the first crop is needed
        img_color = ""
        #img_color = cv2.imread(extract_jp2(zp, filename))
        #delete_jp2(filename)
        #h,w,_ = img_color.shape
        #current_page_shape = (w,h)
        #dh = float(h)/float(page_shape[1])
        #dw = float(w)/float(page_shape[0])
        identifier, title, author, pubplace, publisher, guesseddate = parse_xml(
            id)
        for rect in images:
            counter += 1
            if rect[2] * rect[3] < threshold:
                print("Attempting to slice '{3}' from {0}_{1}_{2}".format(
                    id, vol, page, rect))
                #scaled = increase_size(rect, scale, (dw,dh), current_page_shape)
                #boundary = get_rect(scaled)
                if len(guesseddate) != 4:
                    guesseddate = "Unknown"
                if not os.path.exists(os.path.join(embellish_path, guesseddate)):
                    os.mkdir(os.path.join(embellish_path, guesseddate))
                img_filename = os.path.join(
                    embellish_path, guesseddate,
                    "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                        id, vol, page, counter, make_safe(title), guesseddate))
                short_filename = os.path.join(
                    embellish_path, guesseddate,
                    "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                        id, vol, page, counter, make_safe(title)[:80],
                        guesseddate))
                # if not already captured this, switch to new short title and carry on
                if not os.path.exists(img_filename):
                    img_filename = short_filename
                # Has this been captured before?
                try:
                    if not os.path.exists(img_filename):
                        if img_color == "":
                            # first crop on this page: decode the jp2 now
                            img_color = cv2.imread(extract_jp2(zp, filename))
                            delete_jp2(filename)
                            h, w, _ = img_color.shape
                            current_page_shape = (w, h)
                            dh = float(h) / float(page_shape[1])
                            dw = float(w) / float(page_shape[0])
                        scaled = increase_size(rect, scale, (dw, dh),
                                               current_page_shape)
                        boundary = get_rect(scaled)
                        cv2.imwrite(
                            img_filename,
                            img_color[boundary[0][1]:boundary[1][1],
                                      boundary[0][0]:boundary[1][0]],
                            [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                        print("{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                            id, vol, page, counter, make_safe(title),
                            guesseddate))
                        if os.path.isfile(img_filename):
                            r.lpush("secondpassembellish", img_filename)
                            noplates = r.llen("secondpassembellish")
                            print("{1} - {0}".format(img_filename, noplates))
                        else:
                            print("CV2 SECRETLY FAILED...")
                            r.lpush("embellisherror", img_filename)
                    else:
                        print("Already captured, skipping")
                except:
                    # fallback name without the title component
                    # NOTE(review): this path reuses `img_color`/`boundary`
                    # from earlier state and may itself raise — confirm.
                    cv2.imwrite(
                        os.path.join(
                            embellish_path, guesseddate,
                            "{0}_{1}_{2}_{3}_{4}.jpg".format(
                                id, vol, page, counter, guesseddate)),
                        img_color[boundary[0][1]:boundary[1][1],
                                  boundary[0][0]:boundary[1][0]],
                        [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                    print("{0}_{1}_{2}_{3}_{4}.jpg".format(
                        id, vol, page, counter, guesseddate))
            else:
                print("Already captured, skipping")
    else:
        print(
            "All illustration areas in {1}(vol:{2}, pg: {3} are above threshold area size (currently: {0})"
            .format(threshold, id, vol, page))
import flickr_api a = flickr_api.auth.AuthHandler.load("BLLibraryAuth") flickr_api.set_auth_handler(a) if __name__ == "__main__": workerid = "uploadwrk"+sys.argv[1] while(True): job = get_job(workerid) while(job): if job != "": img = job.strip() location = os.path.join(embellishments_path, img) # get id id, vol, page, _ = img.split("/",1)[1].split("_",3) metadata = parse_xml(id) """ Arguments: photo_file The file to upload. title (optional) The title of the photo. description (optional) A description of the photo. May contain some limited HTML. tags (optional) A space-seperated list of tags to apply to the photo. is_public, is_friend, is_family (optional) Set to 0 for no, 1 for yes. Specifies who can view the photo. safety_level (optional) Set to 1 for Safe, 2 for Moderate, or 3 for Restricted. content_type (optional)
def snip_illustrations(zp, filename, altoxmloldpath, scale = 1.1, threshold = 1200*600, zipfilename=""):
    """Plate snipper: extract LARGE illustrations (area > threshold).

    Skips pages matching the book's extent (likely covers), crops each
    qualifying region from the jp2 page image and pushes results onto
    the `plateimgs` queue.
    """
    id, vol, page = altopath_to_idvol(altoxmloldpath)
    start, end = get_extent(id, zipfilename)
    # first/last page of the extent is assumed to be the cover — skip it
    if int(page) == start or int(page) == end:
        print("This looks like the book cover - page '{0}' - extent ({1}, {2})".format(int(page), start, end))
        return
    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    doc = []
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)
    # only decode the page image if some illustration exceeds the threshold
    if images and [x for x in images if x[2]*x[3] > threshold]:
        counter = 0
        img_color = cv2.imread(extract_jp2(zp, filename))
        delete_jp2(filename)
        h, w, _ = img_color.shape
        current_page_shape = (w, h)
        # scale factors between ALTO page coords and decoded image pixels
        dh = float(h)/float(page_shape[1])
        dw = float(w)/float(page_shape[0])
        identifier, title, author, pubplace, publisher, guesseddate = parse_xml(id)
        for rect in images:
            counter += 1
            if rect[2]*rect[3] > threshold:
                print("Attempting to slice '{1}' from {0}".format(filename, rect))
                scaled = increase_size(rect, scale, (dw, dh), current_page_shape)
                boundary = get_rect(scaled)
                if len(guesseddate) != 4:
                    guesseddate = "Unknown"
                if not os.path.exists(os.path.join(embellish_path, guesseddate)):
                    os.mkdir(os.path.join(embellish_path, guesseddate))
                img_filename = os.path.join(embellish_path, guesseddate, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title), guesseddate))
                short_filename = os.path.join(embellish_path, guesseddate, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title)[:80], guesseddate))
                # if not already captured this, switch to new short title and carry on
                if not os.path.exists(img_filename):
                    img_filename = short_filename
                # Has this been captured before?
                try:
                    if not os.path.exists(img_filename):
                        cv2.imwrite(img_filename, img_color[boundary[0][1]:boundary[1][1], boundary[0][0]:boundary[1][0]], [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                        if os.path.isfile(img_filename):
                            r.lpush(plateimgs, img_filename)
                            noplates = r.llen(plateimgs)
                            print("Plate no: {1} - {0}".format(img_filename, noplates))
                        else:
                            print("CV2 SECRETLY FAILED...")
                            r.lpush("plateerror", altoxmloldpath)
                    else:
                        print("Already captured, skipping")
                except:
                    # fallback: retry with a filename that omits the title
                    # (e.g. when the long path is unwritable)
                    img_filename = os.path.join(embellish_path, guesseddate, "{0}_{1}_{2}_{3}_{4}.jpg".format(id, vol, page, counter, guesseddate))
                    cv2.imwrite(img_filename, img_color[boundary[0][1]:boundary[1][1], boundary[0][0]:boundary[1][0]], [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                    if os.path.isfile(img_filename):
                        r.lpush(plateimgs, img_filename)
                        noplates = r.llen(plateimgs)
                        print("Plate no: {1} - {0}".format(img_filename, noplates))
                    else:
                        print("CV2 SECRETLY FAILED...")
                        r.lpush("plateerror", altoxmloldpath)
    else:
        print("All illustration areas in {1}(vol:{2}, pg: {3} are below threshold area size (currently: {0})".format(threshold, id, vol, page))
def snip_illustrations(zp, filename, altoxmloldpath, scale = 1.1, threshold = 1200*600):
    """Second-pass snipper (duplicate variant): extract small illustrations.

    Regions whose area is below *threshold* are cropped from the page jp2
    (decoded lazily) and queued on "secondpassembellish".
    """
    id, vol, page = altopath_to_idvol(altoxmloldpath)
    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    doc = []
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)
    # require at least one sub-threshold illustration on the page
    if images and [x for x in images if x[2]*x[3] < threshold]:
        counter = 0
        skippable = True
        for rect in images:
            if rect[2]*rect[3] < threshold:
                skippable = False
        if skippable:
            print("No suitable images found within '{0}' - skipping".format(filename))
            return
        else:
            print("Page contains a relevant illustration, not skipping.")
        # "" is a sentinel: the jp2 is only decoded when a crop is needed
        img_color = ""
        #img_color = cv2.imread(extract_jp2(zp, filename))
        #delete_jp2(filename)
        #h,w,_ = img_color.shape
        #current_page_shape = (w,h)
        #dh = float(h)/float(page_shape[1])
        #dw = float(w)/float(page_shape[0])
        identifier, title, author, pubplace, publisher, guesseddate = parse_xml(id)
        for rect in images:
            counter += 1
            if rect[2]*rect[3] < threshold:
                print("Attempting to slice '{3}' from {0}_{1}_{2}".format(id, vol, page, rect))
                #scaled = increase_size(rect, scale, (dw,dh), current_page_shape)
                #boundary = get_rect(scaled)
                if len(guesseddate) != 4:
                    guesseddate = "Unknown"
                if not os.path.exists(os.path.join(embellish_path, guesseddate)):
                    os.mkdir(os.path.join(embellish_path, guesseddate))
                img_filename = os.path.join(embellish_path, guesseddate, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title), guesseddate))
                short_filename = os.path.join(embellish_path, guesseddate, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title)[:80], guesseddate))
                # if not already captured this, switch to new short title and carry on
                if not os.path.exists(img_filename):
                    img_filename = short_filename
                # Has this been captured before?
                try:
                    if not os.path.exists(img_filename):
                        if img_color == "":
                            # lazy decode on first needed crop
                            img_color = cv2.imread(extract_jp2(zp, filename))
                            delete_jp2(filename)
                            h, w, _ = img_color.shape
                            current_page_shape = (w, h)
                            dh = float(h)/float(page_shape[1])
                            dw = float(w)/float(page_shape[0])
                        scaled = increase_size(rect, scale, (dw, dh), current_page_shape)
                        boundary = get_rect(scaled)
                        cv2.imwrite(img_filename, img_color[boundary[0][1]:boundary[1][1], boundary[0][0]:boundary[1][0]], [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                        print("{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title), guesseddate))
                        if os.path.isfile(img_filename):
                            r.lpush("secondpassembellish", img_filename)
                            noplates = r.llen("secondpassembellish")
                            print("{1} - {0}".format(img_filename, noplates))
                        else:
                            print("CV2 SECRETLY FAILED...")
                            r.lpush("embellisherror", img_filename)
                    else:
                        print("Already captured, skipping")
                except:
                    # NOTE(review): fallback relies on `img_color`/`boundary`
                    # already being set; may re-raise if they are not.
                    cv2.imwrite(os.path.join(embellish_path, guesseddate, "{0}_{1}_{2}_{3}_{4}.jpg".format(id, vol, page, counter, guesseddate)), img_color[boundary[0][1]:boundary[1][1], boundary[0][0]:boundary[1][0]], [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                    print("{0}_{1}_{2}_{3}_{4}.jpg".format(id, vol, page, counter, guesseddate))
            else:
                print("Already captured, skipping")
    else:
        print("All illustration areas in {1}(vol:{2}, pg: {3} are above threshold area size (currently: {0})".format(threshold, id, vol, page))
def snip_illustrations(zp, filename, altoxmloldpath, scale=1.1,
                       threshold=1200 * 600, zipfilename=""):
    """Plate snipper (formatted variant): extract large illustrations.

    Cover pages (page number equal to either end of the book extent) are
    skipped; qualifying crops are queued on `plateimgs`.
    """
    id, vol, page = altopath_to_idvol(altoxmloldpath)
    start, end = get_extent(id, zipfilename)
    # extent boundary pages are assumed to be covers
    if int(page) == start or int(page) == end:
        print(
            "This looks like the book cover - page '{0}' - extent ({1}, {2})".
            format(int(page), start, end))
        return
    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    doc = []
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)
    if images and [x for x in images if x[2] * x[3] > threshold]:
        counter = 0
        img_color = cv2.imread(extract_jp2(zp, filename))
        delete_jp2(filename)
        h, w, _ = img_color.shape
        current_page_shape = (w, h)
        # ALTO-to-pixel scale factors
        dh = float(h) / float(page_shape[1])
        dw = float(w) / float(page_shape[0])
        identifier, title, author, pubplace, publisher, guesseddate = parse_xml(
            id)
        for rect in images:
            counter += 1
            if rect[2] * rect[3] > threshold:
                print("Attempting to slice '{1}' from {0}".format(
                    filename, rect))
                scaled = increase_size(rect, scale, (dw, dh),
                                       current_page_shape)
                boundary = get_rect(scaled)
                if len(guesseddate) != 4:
                    guesseddate = "Unknown"
                if not os.path.exists(os.path.join(embellish_path, guesseddate)):
                    os.mkdir(os.path.join(embellish_path, guesseddate))
                img_filename = os.path.join(
                    embellish_path, guesseddate,
                    "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                        id, vol, page, counter, make_safe(title), guesseddate))
                short_filename = os.path.join(
                    embellish_path, guesseddate,
                    "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                        id, vol, page, counter, make_safe(title)[:80],
                        guesseddate))
                # if not already captured this, switch to new short title and carry on
                if not os.path.exists(img_filename):
                    img_filename = short_filename
                # Has this been captured before?
                try:
                    if not os.path.exists(img_filename):
                        cv2.imwrite(
                            img_filename,
                            img_color[boundary[0][1]:boundary[1][1],
                                      boundary[0][0]:boundary[1][0]],
                            [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                        if os.path.isfile(img_filename):
                            r.lpush(plateimgs, img_filename)
                            noplates = r.llen(plateimgs)
                            print("Plate no: {1} - {0}".format(
                                img_filename, noplates))
                        else:
                            print("CV2 SECRETLY FAILED...")
                            r.lpush("plateerror", altoxmloldpath)
                    else:
                        print("Already captured, skipping")
                except:
                    # retry with a shorter, title-free filename
                    img_filename = os.path.join(
                        embellish_path, guesseddate,
                        "{0}_{1}_{2}_{3}_{4}.jpg".format(
                            id, vol, page, counter, guesseddate))
                    cv2.imwrite(
                        img_filename,
                        img_color[boundary[0][1]:boundary[1][1],
                                  boundary[0][0]:boundary[1][0]],
                        [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
                    if os.path.isfile(img_filename):
                        r.lpush(plateimgs, img_filename)
                        noplates = r.llen(plateimgs)
                        print("Plate no: {1} - {0}".format(
                            img_filename, noplates))
                    else:
                        print("CV2 SECRETLY FAILED...")
                        r.lpush("plateerror", altoxmloldpath)
    else:
        print(
            "All illustration areas in {1}(vol:{2}, pg: {3} are below threshold area size (currently: {0})"
            .format(threshold, id, vol, page))
# Redis-backed worker loop: claims xml-parsing jobs from the shared queue
# and pushes tab-joined result rows to the output list.
r = Redis()
wq = "q"           # shared work queue
output = "output"  # results list


def get_job(workerid):
    """Claim a job for this worker, or resume an unfinished one."""
    if r.llen(workerid) == 0:
        # no in-flight job: atomically move one from the shared queue
        status = r.rpoplpush(wq, workerid)
        return status
    else:
        # crash recovery: re-read the job left on this worker's list
        return r.lrange(workerid, 0, 0)[0]


def clear_job(workerid, job):
    # NOTE(review): argument order matches the legacy redis-py
    # lrem(name, value, num) API; redis-py >= 3 expects
    # lrem(name, count, value) — confirm the installed client version.
    r.lrem(workerid, job, 1)


if __name__ == "__main__":
    workerid = "wrk" + sys.argv[1]
    while (True):
        job = get_job(workerid)
        while (job):
            try:
                row = parse_xml(job)
                r.lpush(output, u"\t".join(row).encode("utf-8"))
                clear_job(workerid, job)
            except OSError, e:
                # unreadable source: drop the job instead of looping forever
                clear_job(workerid, job)
            job = get_job(workerid)
        print("%s ran out of jobs - waiting for 10 seconds before checking again" % workerid)
        time.sleep(10)