def get_info(self, xmlfile):
    """Read every child entry of ``xmlfile`` into a dictionary.

    Each child element of the XML root is read through its ``platform``,
    ``id``, ``ip`` and ``port`` attributes.

    Args:
        xmlfile: value handed to ``parse_xml`` (presumably a file path --
            confirm against ``parse_xml``).

    Returns:
        dict: ``"<platform>_s<id>"`` mapped to a dict with the keys
        ``"ip"``, ``"id"``, ``"platform"`` and ``"port"`` (port as int).
        ``None`` if any step raised; the error is printed instead of
        propagating.
    """
    try:
        xml = parse_xml(xmlfile, show_log=False)
        # Validate the file before asking for its root element.
        xml.xml_exists()
        xml.get_root()
        agents = {}
        for child in xml.get_element_children(xml.root):
            plat = child.get("platform")
            cid = child.get("id")
            ip = child.get("ip")
            sshport = int(child.get("port"))
            # Entries sharing platform and id overwrite each other.
            agents[plat + "_s" + cid] = {
                "ip": ip,
                "id": cid,
                "platform": plat,
                "port": sshport,
            }
        if self.show_log:
            for agent in agents:
                print(agent)
        return agents
    except Exception as err:
        # %-formatting keeps the output identical on Python 2 and 3.
        print("%s %s" % ("获取失败.错误如下:", err))
        return None
Esempio n. 2
0
def read_label_from_xml(label_path):
    """Read per-frame labels from an xml file.

    Args:
        label_path: value handed to ``pt.parse_xml``.

    Returns:
        tuple: ``(label_dic, size)``.

        label_dic (dict): frame index mapped to a dict with the keys
            ``"trans"``, ``"rot"`` and ``"size"``.  When several labels
            cover the same frame their values are stacked row-wise with
            ``np.vstack``.
        size: the ``size`` attribute of the last label parsed, or ``None``
            when the file holds no labels.  NOTE(review): presumably the
            bounding-box size as [l, w, h] -- confirm the order.
    """
    labels = pt.parse_xml(label_path)
    label_dic = {}
    # Pre-bind so an empty label list returns ({}, None) instead of raising
    # UnboundLocalError at the return statement.
    size = None
    for label in labels:
        first_frame = label.first_frame
        size = label.size
        frames = range(first_frame, first_frame + label.num_frames)
        for index, place, rot in zip(frames, label.trans, label.rots):
            entry = label_dic.get(index)
            if entry is None:
                label_dic[index] = {
                    "trans": place,
                    "rot": rot,
                    "size": np.array(size),
                }
            else:
                entry["trans"] = np.vstack((entry["trans"], place))
                entry["size"] = np.vstack((entry["size"], np.array(size)))
                entry["rot"] = np.vstack((entry["rot"], rot))
    return label_dic, size
	def get_info(self, xmlfile):
		"""Read every child entry of ``xmlfile`` into a dictionary.

		Each child element of the XML root is read through its ``platform``,
		``id``, ``ip`` and ``port`` attributes.

		Returns:
			dict: ``"<platform>_s<id>"`` mapped to a dict with the keys
			``"ip"``, ``"id"``, ``"platform"`` and ``"port"`` (port as int).
			``None`` if any step raised; the error is printed instead of
			propagating.
		"""
		try:
			xml = parse_xml(xmlfile, show_log=False)
			# Validate the file before asking for its root element.
			xml.xml_exists()
			xml.get_root()
			agents = {}
			for child in xml.get_element_children(xml.root):
				plat = child.get("platform")
				cid = child.get("id")
				ip = child.get("ip")
				sshport = int(child.get("port"))
				# Entries sharing platform and id overwrite each other.
				agents[plat + "_s" + cid] = {
					"ip": ip,
					"id": cid,
					"platform": plat,
					"port": sshport,
				}
			if self.show_log:
				for agent in agents:
					print(agent)
			return agents
		except Exception as err:
			# %-formatting keeps the output identical on Python 2 and 3.
			print("%s %s" % ("获取失败.错误如下:", err))
			return None
 def get_repeat_ip(self, xmlfile):
     """Collect the distinct ``ip`` attributes of the entries in ``xmlfile``.

     When ``self.show_log`` is set every address is printed.  When
     ``self.writefile`` is set the addresses are written, one per line, to
     ``"<name>_ip.txt"``, where ``<name>`` is the text of ``xmlfile`` before
     its first ``"."``.  The file is now written regardless of ``show_log``;
     before, it was created but left empty unless ``show_log`` was also set.

     Returns:
         list: the sorted unique ip values, or ``None`` if any step raised
         (the exception is printed instead of propagating).
     """
     try:
         xml = parse_xml(xmlfile, show_log=False)
         # Validate the file before asking for its root element.
         xml.xml_exists()
         xml.get_root()
         childs = xml.get_element_children(xml.root)
         ips = sorted(set(child.get("ip") for child in childs))
         if self.show_log:
             for ip in ips:
                 print(ip)
         if self.writefile:
             # NOTE(review): splitting on the first "." mangles paths such
             # as "./conf.xml"; kept so existing output names do not change.
             namefile = "%s_ip.txt" % xmlfile.split(".")[0]
             # Mode 'w' truncates any previous file; the with-block closes
             # the handle even when a write fails.
             with open(namefile, 'w') as out:
                 for ip in ips:
                     out.write(ip)
                     out.write("\n")
         return ips
     except Exception as err:
         print(err)
	def get_repeat_ip(self, xmlfile):
		"""Collect the distinct ``ip`` attributes of the entries in ``xmlfile``.

		When ``self.show_log`` is set every address is printed.  When
		``self.writefile`` is set the addresses are written, one per line, to
		``"<name>_ip.txt"``, where ``<name>`` is the text of ``xmlfile`` before
		its first ``"."``.  The file is now written regardless of ``show_log``;
		before, it was created but left empty unless ``show_log`` was also set.

		Returns:
			list: the sorted unique ip values, or ``None`` if any step raised
			(the exception is printed instead of propagating).
		"""
		try:
			xml = parse_xml(xmlfile, show_log=False)
			# Validate the file before asking for its root element.
			xml.xml_exists()
			xml.get_root()
			childs = xml.get_element_children(xml.root)
			ips = sorted(set(child.get("ip") for child in childs))
			if self.show_log:
				for ip in ips:
					print(ip)
			if self.writefile:
				# NOTE(review): splitting on the first "." mangles paths such
				# as "./conf.xml"; kept so existing output names do not change.
				namefile = "%s_ip.txt" % xmlfile.split(".")[0]
				# Mode 'w' truncates any previous file; the with-block closes
				# the handle even when a write fails.
				with open(namefile, 'w') as out:
					for ip in ips:
						out.write(ip)
						out.write("\n")
			return ips
		except Exception as err:
			print(err)
def main():
    """Interactively enhance the organization names of an XML document.

    Prompts for a file name, parses it, lets ``parse_xml`` fill a list of
    organization objects, optionally walks through them so individual
    entries can be validated, and finally writes the enhanced document to
    ``enhanced.xml`` in the current directory.

    The function returns ``None``; several branches end the program through
    ``exit()``.
    """
    # Organization objects found in the document (filled by parse_xml).
    orgs = []
    source_name = input('Enter the name of a file you wish to enhance: ')
    namespaces = {'gmi': "http://www.isotc211.org/2005/gmi", 'gmd': "http://www.isotc211.org/2005/gmd", 'gco':
                  "http://www.isotc211.org/2005/gco", 'gml': "http://www.opengis.net/gml/3.2", 'gmx':
                  "http://www.isotc211.org/2005/gmx", 'gsr': "http://www.isotc211.org/2005/gsr",
                  'gss': "http://www.isotc211.org/2005/gss", 'gts': "http://www.isotc211.org/2005/gts",
                  'xlink': "http://www.w3.org/1999/xlink", 'xsi': "http://www.w3.org/2001/XMLSchema-instance",
                  'saxon': "http://saxon.sf.net/", 'srv': "http://www.isotc211.org/2005/srv",
                  'schemaLocation': "http://www.isotc211.org/2005/gmi http://www.ngdc.noaa.gov/metadata/"
                                    "published/xsd/schema.xsd"}
    tree = ET.parse(source_name)
    root = tree.getroot()
    # Register the prefixes so serialization keeps them instead of ns0, ns1...
    for prefix, uri in namespaces.items():
        ET.register_namespace(prefix, uri)

    # Read document and populate orgs with the raw organization names found.
    if not parse_xml(root, orgs):
        print('Problem reading document')
        return
    if not orgs:
        print('No organization names could be found in the document. Exiting...')
        exit()

    option = input(choice)
    if option == 'I':
        for each in orgs:
            print(each.name)
            next_thing = input('(E)nhance organization or press any other key to continue '
                               'iterating through organizations found.\n')
            if next_thing == 'E':
                each.validate_in_viaf()
                print(each.enhancement_info())

    enhance_doc = input('Enhance document? If no, program will exit [Y/N]: ')
    if enhance_doc == 'N':
        exit()
    if enhance_doc != 'Y':
        # Any other answer falls through without touching the document.
        return

    enhanced_orgs = [each for each in orgs if each.validated]
    if not enhanced_orgs:
        print('No enhancements could be made. Exiting.')
        exit()

    # Enhance xml document
    enhance_xml(root, enhanced_orgs)
    rough_string = ET.tostring(root, 'utf-8')
    parsed = xml.dom.minidom.parseString(rough_string)
    # The with-block closes the file even if serialization or writing fails.
    with open('enhanced.xml', 'wb') as out:
        out.write(parsed.toxml('utf-8'))
    print('Enhanced document created. See enhanced.xml')
Esempio n. 7
0
def snip_illustrations(zp, filename, id, vol, page):
  """Save the image in ``filename`` as a JPEG under ``cover_path``.

  The image is extracted through ``extract_jp2(zp, filename)``, read with
  OpenCV, and written to ``<cover_path>/<guesseddate>/`` using a name built
  from ``id``, ``vol``, the sanitized title and the guessed date (both taken
  from ``parse_xml(id)``).  ``page`` is accepted but not used.

  A ``KeyError`` raised anywhere in that sequence is not propagated;
  instead ``r.rpoplpush(workerid, "coverproblems")`` is called.
  """
  try:
    img_color = cv2.imread(extract_jp2(zp, filename))
    delete_jp2(filename)
    identifier, title, author, pubplace, publisher, guesseddate = parse_xml(id)
    target_dir = os.path.join(cover_path, guesseddate)
    if not os.path.exists(target_dir):
      os.mkdir(target_dir)
    # Build the name once; it is used for both the file and the log line.
    jpg_name = "{0}_{1}_{2}_{3}.jpg".format(id, vol, make_safe(title), guesseddate)
    cv2.imwrite(os.path.join(target_dir, jpg_name), img_color, [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
    print(jpg_name)
  except KeyError:
    r.rpoplpush(workerid, "coverproblems")
Esempio n. 8
0
def read_objects(tracklet_file, num_frames):
    """Group the 3D boxes of all tracklets in ``tracklet_file`` by frame.

    Args:
        tracklet_file: value handed to ``parse_xml``.
        num_frames: number of frames; the result has one list per frame.

    Returns:
        list: ``num_frames`` lists.  Each element is a plain object with the
        attributes ``box`` (8x3 array of corner positions), ``type`` (the
        tracklet's ``objectType``) and ``tracklet_id`` (the tracklet's
        position in the parsed list).

    Raises:
        AssertionError: if a pose has a non-zero rotation other than yaw.
        IndexError: if a kept pose falls on a frame index >= ``num_frames``.
    """
    objects = [[] for _ in range(num_frames)]  # grouped by frames

    # read tracklets from file
    for tracklet_id, tracklet in enumerate(parse_xml(tracklet_file)):
        # this part is inspired by kitti object development kit matlab code: computeBox3D
        h, w, l = tracklet.size
        # Box corners around the zero point, without orientation yet.
        trackletBox = np.array([
            [-l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2],
            [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2],
            [0.0, 0.0, 0.0, 0.0, h, h, h, h]])

        # The frame index advances with EVERY pose, including skipped ones.
        # Advancing it only for kept poses shifted every later pose of the
        # tracklet onto an earlier frame.
        for frame, pose in enumerate(tracklet, tracklet.firstFrame):
            (translation, rotation, _state, _occlusion, truncation,
             _amt_occlusion, _amt_borders, _absolute_frame) = pose

            # determine if object is in the image; otherwise continue
            if truncation not in (TRUNC_IN_IMAGE, TRUNC_TRUNCATED):
                continue

            # re-create 3D bounding box from the yaw angle and translation
            yaw = rotation[2]  # other rotations are 0 in all xml files I checked
            assert np.abs(rotation[:2]).sum() == 0, 'object rotations other than yaw given!'
            rotMat = np.array([
                [np.cos(yaw), -np.sin(yaw), 0.0],
                [np.sin(yaw), np.cos(yaw), 0.0],
                [0.0, 0.0, 1.0]
            ])
            cornerPosInVelo = np.dot(rotMat, trackletBox) + np.tile(translation, (8, 1)).T

            # Anonymous attribute bag, one per kept pose.
            o = type('', (), {})()
            o.box = cornerPosInVelo.transpose()
            o.type = tracklet.objectType
            o.tracklet_id = tracklet_id
            objects[frame].append(o)

    return objects
Esempio n. 9
0
def gather_signals(filepath):
  """Collect metadata and computer-vision signals for one image file.

  The last "/"-separated component of ``filepath`` is broken down by
  ``breakdown_imagename`` into id, volume, page and image number; the id is
  looked up through ``parse_xml``.  Every callable in ``CV_SIGNALS`` is then
  applied to ``filepath``; tuple results are flattened into the signal list,
  anything else is appended as a single value.

  Returns a dict with the keys 'metadata', 'identifier' and 'cv_signals'.
  """
  print("Gathering signals for '{0}'".format(filepath))
  basename = filepath.split("/")[-1]
  book_id, volume, page_no, image_no = breakdown_imagename(basename)
  book_metadata = parse_xml(book_id)

  collected = []
  for measure in CV_SIGNALS:
    outcome = measure(filepath)
    # A tuple carries several signals at once; everything else is one signal.
    collected += outcome if isinstance(outcome, tuple) else [outcome]

  return {
      'metadata': book_metadata,
      'identifier': (book_id, volume, page_no, image_no),
      'cv_signals': collected,
  }
Esempio n. 10
0
def convert_component(path: Union[str, PathLike], mc=False) -> Path:
    """Convert an xml component file into a sibling ``.dot`` file.

    The xml file is parsed with ``parse_xml`` and the result is written by
    ``modal_to_dot`` to the same location with the suffix replaced by
    ``.dot``.  An existing ``.dot`` file is not treated specially.

    Args:
        path: location of the xml file.
        mc: forwarded unchanged to ``modal_to_dot``.

    Returns:
        Path of the ``.dot`` file.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if ``path`` does not end in ``.xml``.
    """
    root_file = Path(path)
    if not root_file.exists():
        raise FileNotFoundError(f'File does not exists: {root_file}')
    if root_file.suffix != '.xml':
        raise ValueError(f'Expected xml file, got {root_file.suffix}')

    rename_to = root_file.with_suffix('.dot')
    graph_repr = parse_xml(root_file)
    modal_to_dot(str(rename_to.absolute()), graph_repr, mc=mc)

    return rename_to
 def get_all_info(self, xmlfile):
     """Return ``[id, ip, platform]`` for every child entry of ``xmlfile``.

     The result is a list of three-element lists, in the order in which
     ``get_element_children`` yields the children of the XML root.
     Exceptions are not caught here.
     """
     document = parse_xml(xmlfile, show_log=False)
     # Validate the file before asking for its root element.
     document.xml_exists()
     document.get_root()
     return [
         [node.get("id"), node.get("ip"), node.get("platform")]
         for node in document.get_element_children(document.root)
     ]
	def get_all_info(self, xmlfile):
		"""Return ``[id, ip, platform]`` for every child entry of ``xmlfile``.

		The result is a list of three-element lists, in the order in which
		``get_element_children`` yields the children of the XML root.
		Exceptions are not caught here.
		"""
		document = parse_xml(xmlfile, show_log=False)
		# Validate the file before asking for its root element.
		document.xml_exists()
		document.get_root()
		return [
			[node.get("id"), node.get("ip"), node.get("platform")]
			for node in document.get_element_children(document.root)
		]
Esempio n. 13
0
def convert(args):
    """Convert the input file named in ``args`` to the requested output file.

    ``args`` is any object accepted by ``vars()`` (e.g. an argparse
    namespace).  Its attributes override the defaults ``input``, ``output``,
    ``mc`` and ``auto_group``.  Input and output must both end in ``.dot`` or
    ``.xml``; the input is read with ``parse_dot`` / ``parse_xml`` and
    written with ``modal_to_dot`` / ``modal_to_xml``.

    A missing input file or an unknown suffix is logged as an error and ends
    the program with ``exit(-1)``.
    """
    options = {
        'input': '',
        'output': '',
        'mc': False,
        'auto_group': False
    }
    options.update(vars(args))

    ipath = Path(options['input'])
    opath = Path(options['output'])

    pprint(options)

    if not ipath.exists():
        log.error(f'Given path does not exists, can not read: {ipath}')
        exit(-1)

    # Input is checked before output, so the first offender is reported first.
    for candidate in (ipath, opath):
        if candidate.suffix not in ['.dot', '.xml']:
            log.error(f'Unknown suffix: {candidate.suffix}')
            exit(-1)

    mts_repr = None
    if ipath.suffix == '.xml':
        log.debug(f'Read input (xml): {ipath}')
        mts_repr = parse_xml(ipath)
    elif ipath.suffix == '.dot':
        log.debug(f'Read input (dot): {ipath}')
        mts_repr = parse_dot(ipath)

    if opath.suffix == '.dot':
        writer = modal_to_dot
    elif opath.suffix == '.xml':
        writer = modal_to_xml
    else:
        writer = None

    if writer is not None:
        log.info(f'Writing to {opath} ...')
        writer(path=opath,
               mts=mts_repr,
               mc=options['mc'],
               derive_groups=options['auto_group'])
Esempio n. 14
0
def snip_illustrations(zp, filename, id, vol, page):
    """Save the image in ``filename`` as a JPEG under ``cover_path``.

    The image is extracted through ``extract_jp2(zp, filename)``, read with
    OpenCV, and written to ``<cover_path>/<guesseddate>/`` using a name
    built from ``id``, ``vol``, the sanitized title and the guessed date
    (both taken from ``parse_xml(id)``).  ``page`` is accepted but not used.

    A ``KeyError`` raised anywhere in that sequence is not propagated;
    instead ``r.rpoplpush(workerid, "coverproblems")`` is called.
    """
    try:
        img_color = cv2.imread(extract_jp2(zp, filename))
        delete_jp2(filename)
        identifier, title, author, pubplace, publisher, guesseddate = parse_xml(
            id)
        target_dir = os.path.join(cover_path, guesseddate)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        # Build the name once; it is used for both the file and the log line.
        jpg_name = "{0}_{1}_{2}_{3}.jpg".format(id, vol, make_safe(title),
                                                guesseddate)
        cv2.imwrite(os.path.join(target_dir, jpg_name), img_color,
                    [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
        print(jpg_name)
    except KeyError:
        r.rpoplpush(workerid, "coverproblems")
Esempio n. 15
0
def get_md(item):
    """Assemble the metadata strings for one image upload job.

    ``item`` is a tab-separated "<job>, <flickr id>" line.  The image path
    is the job string minus its first 24 characters; id, volume and page are
    parsed from the "_"-separated file name after the first "/".  The
    bibliographic fields come from ``parse_xml(id)`` and are UTF-8 encoded
    into the list ``decoded``.  If the image exists under
    ``embellishments_path`` and ``r`` holds an "s:<id>" entry, an HTML
    snippet with viewer and PDF links is appended to ``decoded``; otherwise
    an empty string is appended.

    NOTE(review): the code visible here builds ``decoded`` and
    ``uploaded_title`` but returns nothing -- the function looks truncated.
    """
    job, flickr_id = item.strip().split("\t")
    # Presumably strips a fixed-length path prefix -- confirm against the
    # producer of these job strings.
    img = job.strip()[24:]
    location = os.path.join(embellishments_path, img)
    id, vol, page, _ = img.split("/", 1)[1].split("_", 3)
    metadata = parse_xml(id)
    identifier, title, author, pubplace, publisher, guesseddate = metadata
    decoded = [
        x.encode("utf-8") for x in [
            title, author, guesseddate, pubplace, publisher, vol,
            str(int(page)), identifier
        ]
    ]

    # Start with the raw page text, then prefer the number without leading
    # zeros when it can be produced.
    uploaded_title = u"Image taken from page {1} of '{0}'".format(
        decoded[0].decode("utf-8"), page.decode("utf-8"))
    try:
        uploaded_title = u"Image taken from page {1} of '{0}'".format(
            decoded[0].decode("utf-8"),
            str(int(page)).decode("utf-8"))
    except ValueError:
        # Covers a non-numeric page and decode errors (UnicodeError is a
        # ValueError); the raw-page title above is kept.
        pass

    if os.path.exists(location):
        try:
            rosetta = r.get("s:" + id)
            if rosetta:
                d, a, u = rosetta.strip().split("\t")
                # The ark number is decremented by one, in hexadecimal.
                adjusted_ark_number = "{0:012x}".format(
                    int(a.split("_")[1], 16) - 1)
                adj_ark = "{0}_{1}".format(
                    a.split("_")[0], adjusted_ark_number.upper())
                # The URL template already supplies the "0x" prefix, so the
                # fallback must not repeat it.
                hexpage = "000001"
                try:
                    hexpage = "{0:06x}".format(int(page)).upper()
                except ValueError:
                    pass
                additional = """
<ul><li>Open the page in the <a href="http://itemviewer.bl.uk/?itemid={2}#{1}.0x{0}">British Library's itemViewer (page: {3})</a></li>
<li><a href="http://access.dl.bl.uk/{2}">Download the PDF for this book</a>
""".format(hexpage, adj_ark, u, page)
                decoded += [additional]
            else:
                decoded += [u""]
        except Exception as e:
            print(e)
Esempio n. 16
0
def get_md(item):
  """Assemble the metadata strings for one image upload job.

  ``item`` is a tab-separated "<job>, <flickr id>" line.  The image path is
  the job string minus its first 24 characters; id, volume and page are
  parsed from the "_"-separated file name after the first "/".  The
  bibliographic fields come from ``parse_xml(id)`` and are UTF-8 encoded
  into the list ``decoded``.  If the image exists under
  ``embellishments_path`` and ``r`` holds an "s:<id>" entry, an HTML snippet
  with viewer and PDF links is appended to ``decoded``; otherwise an empty
  string is appended.

  NOTE(review): the code visible here builds ``decoded`` and
  ``uploaded_title`` but returns nothing -- the function looks truncated.
  """
  job, flickr_id = item.strip().split("\t")
  # Presumably strips a fixed-length path prefix -- confirm against the
  # producer of these job strings.
  img = job.strip()[24:]
  location = os.path.join(embellishments_path, img)
  id, vol, page, _ = img.split("/",1)[1].split("_",3)
  metadata = parse_xml(id)
  identifier, title, author, pubplace, publisher, guesseddate = metadata
  decoded = [x.encode("utf-8") for x in [title, author, guesseddate, pubplace, publisher, vol, str(int(page)), identifier]]

  # Start with the raw page text, then prefer the number without leading
  # zeros when it can be produced.
  uploaded_title = u"Image taken from page {1} of '{0}'".format(decoded[0].decode("utf-8"), page.decode("utf-8"))
  try:
    uploaded_title = u"Image taken from page {1} of '{0}'".format(decoded[0].decode("utf-8"), str(int(page)).decode("utf-8"))
  except ValueError:
    # Covers a non-numeric page and decode errors (UnicodeError is a
    # ValueError); the raw-page title above is kept.
    pass

  if os.path.exists(location):
    try:
      rosetta = r.get("s:"+id)
      if rosetta:
        d, a, u = rosetta.strip().split("\t")
        # The ark number is decremented by one, in hexadecimal.
        adjusted_ark_number = "{0:012x}".format(int(a.split("_")[1], 16) - 1)
        adj_ark = "{0}_{1}".format(a.split("_")[0], adjusted_ark_number.upper())
        # The URL template already supplies the "0x" prefix, so the fallback
        # must not repeat it.
        hexpage = "000001"
        try:
          hexpage = "{0:06x}".format(int(page)).upper()
        except ValueError:
          pass
        additional = """
<ul><li>Open the page in the <a href="http://itemviewer.bl.uk/?itemid={2}#{1}.0x{0}">British Library's itemViewer (page: {3})</a></li>
<li><a href="http://access.dl.bl.uk/{2}">Download the PDF for this book</a>
""".format(hexpage, adj_ark, u, page)
        decoded += [additional]
      else:
        decoded += [u""]
    except Exception as e:
      print(e)
Esempio n. 17
0
def snip_illustrations(zp,
                       filename,
                       altoxmloldpath,
                       scale=1.1,
                       threshold=1200 * 600):
    """Cut every illustration smaller than ``threshold`` out of a page.

    The ALTO xml at ``altoxmloldpath`` supplies the page shape and the
    illustration rectangles (via ``get_illustration_coords``).  For each
    rectangle whose ``rect[2] * rect[3]`` is below ``threshold`` the region
    is enlarged by ``scale``, cropped from the page image and written as a
    JPEG under ``<embellish_path>/<guesseddate>/``.  Written files are
    pushed onto the ``secondpassembellish`` list of ``r``; files that
    OpenCV did not actually create go onto ``embellisherror``.

    The page image is only extracted (``extract_jp2``) when the first crop
    really has to be written.

    Args:
        zp: handed to ``extract_jp2`` together with ``filename``.
        filename: name of the page image.
        altoxmloldpath: path of the ALTO xml, translated through
            ``old_winpath_to_vmpath`` before opening.
        scale: enlargement factor handed to ``increase_size``.
        threshold: upper bound (exclusive) for ``rect[2] * rect[3]``.
    """
    id, vol, page = altopath_to_idvol(altoxmloldpath)
    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)

    if not (images and [x for x in images if x[2] * x[3] < threshold]):
        print(
            "All illustration areas in {1}(vol:{2}, pg: {3} are above threshold area size (currently: {0})"
            .format(threshold, id, vol, page))
        return

    # The guard above guarantees at least one rectangle below the threshold.
    print("Page contains a relevant illustration, not skipping.")

    # None marks "page image not loaded yet".  An empty-string sentinel would
    # later be compared against a numpy array.
    img_color = None
    jpeg_params = [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93]
    identifier, title, author, pubplace, publisher, guesseddate = parse_xml(
        id)

    for counter, rect in enumerate(images, 1):
        if not (rect[2] * rect[3] < threshold):
            continue
        print("Attempting to slice '{3}' from {0}_{1}_{2}".format(
            id, vol, page, rect))
        if len(guesseddate) != 4:
            guesseddate = "Unknown"
        target_dir = os.path.join(embellish_path, guesseddate)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        img_filename = os.path.join(
            target_dir, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                id, vol, page, counter, make_safe(title), guesseddate))
        short_filename = os.path.join(
            target_dir, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                id, vol, page, counter,
                make_safe(title)[:80], guesseddate))

        # Captured earlier under the full-title name?
        if os.path.exists(img_filename):
            print("Already captured, skipping")
            continue
        # Otherwise switch to the short-title name and check that one too.
        img_filename = short_filename
        if os.path.exists(img_filename):
            print("Already captured, skipping")
            continue

        if img_color is None:
            img_color = cv2.imread(extract_jp2(zp, filename))
            delete_jp2(filename)
        h, w, _ = img_color.shape
        current_page_shape = (w, h)
        dh = float(h) / float(page_shape[1])
        dw = float(w) / float(page_shape[0])
        scaled = increase_size(rect, scale, (dw, dh), current_page_shape)
        boundary = get_rect(scaled)
        # The crop is computed before the try so the fallback below always
        # has a crop for THIS rectangle, never an undefined or stale one.
        crop = img_color[boundary[0][1]:boundary[1][1],
                         boundary[0][0]:boundary[1][0]]

        try:
            cv2.imwrite(img_filename, crop, jpeg_params)
            print("{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                id, vol, page, counter, make_safe(title), guesseddate))
            if os.path.isfile(img_filename):
                r.lpush("secondpassembellish", img_filename)
                noplates = r.llen("secondpassembellish")
                print("{1} - {0}".format(img_filename, noplates))
            else:
                print("CV2 SECRETLY FAILED...")
                r.lpush("embellisherror", img_filename)
        except Exception:
            # Best-effort fallback: retry under a name without the title.
            cv2.imwrite(
                os.path.join(
                    target_dir, "{0}_{1}_{2}_{3}_{4}.jpg".format(
                        id, vol, page, counter, guesseddate)), crop,
                jpeg_params)
            print("{0}_{1}_{2}_{3}_{4}.jpg".format(
                id, vol, page, counter, guesseddate))
Esempio n. 18
0
import flickr_api
a = flickr_api.auth.AuthHandler.load("BLLibraryAuth")
flickr_api.set_auth_handler(a)

if __name__ == "__main__":
  workerid = "uploadwrk"+sys.argv[1]
  while(True):
    job = get_job(workerid)
    while(job):
      if job != "":
        img = job.strip()
        location = os.path.join(embellishments_path, img)
        # get id
        id, vol, page, _ = img.split("/",1)[1].split("_",3)
        metadata = parse_xml(id)
        """
  Arguments:
        photo_file
            The file to upload.
        title (optional)
            The title of the photo.
        description (optional)
            A description of the photo. May contain some limited HTML.
        tags (optional)
            A space-seperated list of tags to apply to the photo.
        is_public, is_friend, is_family (optional)
            Set to 0 for no, 1 for yes. Specifies who can view the photo.
        safety_level (optional)
            Set to 1 for Safe, 2 for Moderate, or 3 for Restricted.
        content_type (optional)
Esempio n. 19
0
def snip_illustrations(zp, filename, altoxmloldpath, scale = 1.1, threshold = 1200*600, zipfilename=""):
    """Cut every illustration larger than ``threshold`` out of a page.

    Pages equal to the first or last page of the book's extent
    (``get_extent(id, zipfilename)``) are treated as covers and skipped.
    Otherwise the ALTO xml at ``altoxmloldpath`` supplies the page shape and
    the illustration rectangles.  For each rectangle whose
    ``rect[2] * rect[3]`` exceeds ``threshold`` the region is enlarged by
    ``scale``, cropped from the page image and written as a JPEG under
    ``<embellish_path>/<guesseddate>/``.  Written files are pushed onto the
    ``plateimgs`` list of ``r``; if OpenCV did not actually create the file,
    ``altoxmloldpath`` is pushed onto ``plateerror`` instead.

    If writing or recording under the title-based name raises, the crop is
    written and recorded again under a name without the title.
    """
    id, vol, page = altopath_to_idvol(altoxmloldpath)
    start, end = get_extent(id, zipfilename)
    if int(page) == start or int(page) == end:
        print("This looks like the book cover - page '{0}' - extent ({1}, {2})".format(int(page), start, end))
        return

    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)

    if not (images and [x for x in images if x[2]*x[3] > threshold]):
        print("All illustration areas in {1}(vol:{2}, pg: {3} are below threshold area size (currently: {0})".format(threshold, id, vol, page))
        return

    def _record(path):
        # Register a written plate, or flag the page when no file appeared.
        if os.path.isfile(path):
            r.lpush(plateimgs, path)
            noplates = r.llen(plateimgs)
            print("Plate no: {1} - {0}".format(path, noplates))
        else:
            print("CV2 SECRETLY FAILED...")
            r.lpush("plateerror", altoxmloldpath)

    img_color = cv2.imread(extract_jp2(zp, filename))
    delete_jp2(filename)
    h, w, _ = img_color.shape
    current_page_shape = (w, h)
    dh = float(h)/float(page_shape[1])
    dw = float(w)/float(page_shape[0])
    jpeg_params = [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93]
    identifier, title, author, pubplace, publisher, guesseddate = parse_xml(id)

    for counter, rect in enumerate(images, 1):
        if not (rect[2]*rect[3] > threshold):
            continue
        print("Attempting to slice '{1}' from {0}".format(filename, rect))
        scaled = increase_size(rect, scale, (dw, dh), current_page_shape)
        boundary = get_rect(scaled)
        if len(guesseddate) != 4:
            guesseddate = "Unknown"
        target_dir = os.path.join(embellish_path, guesseddate)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        img_filename = os.path.join(target_dir, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title), guesseddate))
        short_filename = os.path.join(target_dir, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title)[:80], guesseddate))
        # If not captured under the full title, carry on with the short title.
        if not os.path.exists(img_filename):
            img_filename = short_filename
        crop = img_color[boundary[0][1]:boundary[1][1], boundary[0][0]:boundary[1][0]]
        # Has this been captured before?
        try:
            if not os.path.exists(img_filename):
                cv2.imwrite(img_filename, crop, jpeg_params)
                _record(img_filename)
            else:
                print("Already captured, skipping")
        except Exception:
            # Best-effort fallback under a name without the title.  Only
            # Exception is caught, so Ctrl-C and SystemExit still propagate.
            img_filename = os.path.join(target_dir, "{0}_{1}_{2}_{3}_{4}.jpg".format(id, vol, page, counter, guesseddate))
            cv2.imwrite(img_filename, crop, jpeg_params)
            _record(img_filename)
def snip_illustrations(zp, filename, altoxmloldpath, scale = 1.1, threshold = 1200*600):
    """Cut every illustration smaller than ``threshold`` out of a page.

    The ALTO xml at ``altoxmloldpath`` supplies the page shape and the
    illustration rectangles (via ``get_illustration_coords``).  For each
    rectangle whose ``rect[2] * rect[3]`` is below ``threshold`` the region
    is enlarged by ``scale``, cropped from the page image and written as a
    JPEG under ``<embellish_path>/<guesseddate>/``.  Written files are
    pushed onto the ``secondpassembellish`` list of ``r``; files that
    OpenCV did not actually create go onto ``embellisherror``.

    The page image is only extracted (``extract_jp2``) when the first crop
    really has to be written.
    """
    id, vol, page = altopath_to_idvol(altoxmloldpath)
    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)

    if not (images and [x for x in images if x[2]*x[3] < threshold]):
        print("All illustration areas in {1}(vol:{2}, pg: {3} are above threshold area size (currently: {0})".format(threshold, id, vol, page))
        return

    # The guard above guarantees at least one rectangle below the threshold.
    print("Page contains a relevant illustration, not skipping.")

    # None marks "page image not loaded yet".  An empty-string sentinel would
    # later be compared against a numpy array.
    img_color = None
    jpeg_params = [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93]
    identifier, title, author, pubplace, publisher, guesseddate = parse_xml(id)

    for counter, rect in enumerate(images, 1):
        if not (rect[2]*rect[3] < threshold):
            continue
        print("Attempting to slice '{3}' from {0}_{1}_{2}".format(id, vol, page, rect))
        if len(guesseddate) != 4:
            guesseddate = "Unknown"
        target_dir = os.path.join(embellish_path, guesseddate)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        img_filename = os.path.join(target_dir, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title), guesseddate))
        short_filename = os.path.join(target_dir, "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title)[:80], guesseddate))

        # Captured earlier under the full-title name?
        if os.path.exists(img_filename):
            print("Already captured, skipping")
            continue
        # Otherwise switch to the short-title name and check that one too.
        img_filename = short_filename
        if os.path.exists(img_filename):
            print("Already captured, skipping")
            continue

        if img_color is None:
            img_color = cv2.imread(extract_jp2(zp, filename))
            delete_jp2(filename)
        h, w, _ = img_color.shape
        current_page_shape = (w, h)
        dh = float(h)/float(page_shape[1])
        dw = float(w)/float(page_shape[0])
        scaled = increase_size(rect, scale, (dw, dh), current_page_shape)
        boundary = get_rect(scaled)
        # The crop is computed before the try so the fallback below always
        # has a crop for THIS rectangle, never an undefined or stale one.
        crop = img_color[boundary[0][1]:boundary[1][1], boundary[0][0]:boundary[1][0]]

        try:
            cv2.imwrite(img_filename, crop, jpeg_params)
            print("{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(id, vol, page, counter, make_safe(title), guesseddate))
            if os.path.isfile(img_filename):
                r.lpush("secondpassembellish", img_filename)
                noplates = r.llen("secondpassembellish")
                print("{1} - {0}".format(img_filename, noplates))
            else:
                print("CV2 SECRETLY FAILED...")
                r.lpush("embellisherror", img_filename)
        except Exception:
            # Best-effort fallback: retry under a name without the title.
            cv2.imwrite(os.path.join(target_dir, "{0}_{1}_{2}_{3}_{4}.jpg".format(id, vol, page, counter, guesseddate)),
                        crop, jpeg_params)
            print("{0}_{1}_{2}_{3}_{4}.jpg".format(id, vol, page, counter, guesseddate))
Esempio n. 21
0
def _save_plate(img_filename, img_color, boundary, altoxmloldpath):
    """Crop ``img_color`` to ``boundary``, write it as a JPEG and log the outcome.

    ``boundary`` is indexed as ``((x0, y0), (x1, y1))``: element ``[.][1]``
    selects image rows and element ``[.][0]`` selects image columns.

    On success the written path is pushed onto the ``plateimgs`` Redis list;
    if the file is not on disk afterwards, ``altoxmloldpath`` is pushed onto
    the ``"plateerror"`` list instead.
    """
    cv2.imwrite(
        img_filename,
        img_color[boundary[0][1]:boundary[1][1],
                  boundary[0][0]:boundary[1][0]],
        [cv2.cv.CV_IMWRITE_JPEG_QUALITY, 93])
    # The return value of imwrite is not consulted; the presence of the file
    # on disk is what decides between success and failure.
    if os.path.isfile(img_filename):
        r.lpush(plateimgs, img_filename)
        noplates = r.llen(plateimgs)
        print("Plate no: {1} - {0}".format(img_filename, noplates))
    else:
        print("CV2 SECRETLY FAILED...")
        r.lpush("plateerror", altoxmloldpath)


def snip_illustrations(zp,
                       filename,
                       altoxmloldpath,
                       scale=1.1,
                       threshold=1200 * 600,
                       zipfilename=""):
    """Cut every sufficiently large illustration out of one scanned page.

    The ALTO XML for the page is parsed to find illustration rectangles.
    Each rectangle whose ``rect[2] * rect[3]`` product exceeds ``threshold``
    is enlarged by ``scale``, cropped from the page image and written as a
    JPEG under ``embellish_path/<guesseddate>/``. Written files are
    registered in Redis (see ``_save_plate``).

    Pages that are the first or last page of the book's extent are treated
    as covers and skipped.

    Args:
        zp: Archive handle handed to ``extract_jp2``
            (presumably an open zip file -- TODO confirm).
        filename: Name of the page image, passed to ``extract_jp2`` and
            ``delete_jp2``.
        altoxmloldpath: Original path of the page's ALTO XML; it is
            translated with ``old_winpath_to_vmpath`` before being opened.
        scale: Enlargement factor forwarded to ``increase_size``.
        threshold: Exclusive lower bound on ``rect[2] * rect[3]`` for a
            rectangle to be extracted.
        zipfilename: Forwarded to ``get_extent``.

    Returns:
        None.
    """
    book_id, vol, page = altopath_to_idvol(altoxmloldpath)
    start, end = get_extent(book_id, zipfilename)
    if int(page) in (start, end):
        print(
            "This looks like the book cover - page '{0}' - extent ({1}, {2})".
            format(int(page), start, end))
        return

    altofilepath = old_winpath_to_vmpath(altoxmloldpath)
    with open(altofilepath, "r") as xmlfile:
        doc = ET.fromstring(xmlfile.read())
    page_shape, images = get_illustration_coords(doc)

    # Nothing to do when there are no rectangles at all, or none big enough.
    if not (images and any(x[2] * x[3] > threshold for x in images)):
        print(
            "All illustration areas in {1}(vol:{2}, pg: {3} are below threshold area size (currently: {0})"
            .format(threshold, book_id, vol, page))
        return

    # NOTE(review): cv2.imread yields None for an unreadable image, in which
    # case the ``.shape`` access below raises AttributeError -- confirm that
    # callers are happy with that failure mode.
    img_color = cv2.imread(extract_jp2(zp, filename))
    delete_jp2(filename)
    h, w, _ = img_color.shape
    current_page_shape = (w, h)
    # Ratio between the decoded image size and the page size declared in the
    # ALTO XML; passed to increase_size together with the rectangle.
    dh = float(h) / float(page_shape[1])
    dw = float(w) / float(page_shape[0])
    _identifier, title, _author, _pubplace, _publisher, guesseddate = \
        parse_xml(book_id)

    # Everything below is identical for all rectangles on the page, so it is
    # computed once instead of once per rectangle.
    if len(guesseddate) != 4:
        guesseddate = "Unknown"
    date_dir = os.path.join(embellish_path, guesseddate)
    if not os.path.exists(date_dir):
        os.mkdir(date_dir)
    safe_title = make_safe(title)

    # The counter numbers every rectangle, including the ones that are
    # skipped, so filenames keep the rectangle's position on the page.
    for counter, rect in enumerate(images, 1):
        if rect[2] * rect[3] <= threshold:
            continue
        print("Attempting to slice '{1}' from {0}".format(filename, rect))
        scaled = increase_size(rect, scale, (dw, dh), current_page_shape)
        boundary = get_rect(scaled)

        img_filename = os.path.join(
            date_dir,
            "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                book_id, vol, page, counter, safe_title, guesseddate))
        # If no capture exists under the full-title name, use the name with
        # the title truncated to 80 characters from here on.
        if not os.path.exists(img_filename):
            img_filename = os.path.join(
                date_dir,
                "{0}_{1}_{2}_{3}_{4}_{5}.jpg".format(
                    book_id, vol, page, counter, safe_title[:80],
                    guesseddate))

        try:
            if os.path.exists(img_filename):
                print("Already captured, skipping")
            else:
                _save_plate(img_filename, img_color, boundary,
                            altoxmloldpath)
        except Exception as err:
            # Fall back to a filename without the title and try once more.
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            print("Primary write failed ({0!r}); retrying without the title "
                  "in the filename".format(err))
            img_filename = os.path.join(
                date_dir,
                "{0}_{1}_{2}_{3}_{4}.jpg".format(
                    book_id, vol, page, counter, guesseddate))
            _save_plate(img_filename, img_color, boundary, altoxmloldpath)
Esempio n. 22
0
# Shared Redis client, constructed with the client's default arguments.
r = Redis()

# Name of the Redis list that pending jobs are taken from (see get_job).
wq = "q"
# Name of the Redis list that the tab-joined parse results are pushed onto.
output = "output"

def get_job(workerid):
  """Return the job this worker should handle next.

  A job that is still sitting in the worker's own Redis list (claimed
  earlier but not yet cleared) takes priority and is returned again.
  When that list is empty, one job is moved from the shared ``wq`` list
  onto the worker's list with ``rpoplpush`` and the result of that call
  is returned.
  """
  has_claimed_job = r.llen(workerid) != 0
  if has_claimed_job:
    head = r.lrange(workerid, 0, 0)
    return head[0]
  # Nothing pending for this worker: claim a new job from the shared queue.
  return r.rpoplpush(wq, workerid)

def clear_job(workerid, job):
  """Remove ``job`` from the worker's own Redis list via ``lrem``.

  NOTE(review): the positional order ``(name, value, count)`` used here
  matches the legacy ``redis.Redis.lrem`` signature; redis-py 3.0+ expects
  ``(name, count, value)`` -- confirm against the installed client version.
  """
  r.lrem(workerid, job, 1)

if __name__ == "__main__":
  # Worker entry point: the first command-line argument names this worker,
  # and "wrk<arg>" is the Redis list holding the job it is working on.
  workerid = "wrk" + sys.argv[1]
  while True:
    job = get_job(workerid)
    while job:
      try:
        row = parse_xml(job)
        # Results are stored as one UTF-8 encoded, tab-separated line.
        r.lpush(output, u"\t".join(row).encode("utf-8"))
        clear_job(workerid, job)
      except OSError as err:
        # The job is dropped rather than retried, but say so instead of
        # discarding it silently. Any other exception type still propagates
        # and leaves the job in the worker's list.
        print("%s could not process %r (%r) - dropping it" % (workerid, job, err))
        clear_job(workerid, job)
      job = get_job(workerid)
    print("%s ran out of jobs - waiting for 10 seconds before checking again" % workerid)
    time.sleep(10)