def extractfeatures_from_file(data):
    """Parse a KML document and collect its Point/LineString/Polygon geometries.

    Parameters:
        data: a KML document (string/bytes) parseable by ElementTree's XML().

    Returns:
        dict with keys 'MultiPoint', 'MultiLineString', 'MultiPolygon'.
        Each value is the corresponding Multi* geometry built from every
        valid geometry found, or None when none were found.
    """
    kmldom = XML(data)
    # Root tag looks like '{namespace}kml'; strip('kml') removes the literal
    # trailing 'k'/'m'/'l' characters and leaves the '{namespace}' prefix.
    ns = kmldom.tag.strip('kml')

    def _coordinate_texts(element):
        """Yield the raw text of every nested <coordinates> element."""
        for node in element.findall('.//%scoordinates' % ns):
            yield node.text

    mpoint = []
    for point in kmldom.findall('.//%sPoint' % ns):
        for text in _coordinate_texts(point):
            # Parse inside the try: previously a malformed number raised
            # ValueError outside the handler and aborted the whole parse.
            try:
                coords = [float(c) for c in text.strip().split(',')]
                mpoint.append(Point(coords))
            except Exception:  # narrowed from a bare except
                logger.info('invalid point geometry: %s' % text[:100])

    mline = []
    for line in kmldom.findall('.//%sLineString' % ns):
        for text in _coordinate_texts(line):
            # Each whitespace-separated token is a "lon,lat[,alt]" tuple.
            try:
                coords = [[float(c) for c in pair.split(',')]
                          for pair in text.split()]
                mline.append(LineString(coords))
            except Exception:
                logger.info('invalid linestring geometry: %s' % text[:100])

    mpoly = []
    for polygon in kmldom.findall('.//%sPolygon' % ns):
        for text in _coordinate_texts(polygon):
            try:
                coords = [[float(c) for c in pair.split(',')]
                          for pair in text.split()]
                mpoly.append(Polygon(coords))
            except Exception:
                logger.info('invalid polygon geometry: %s' % text[:100])

    result = {'MultiPoint': None, 'MultiLineString': None, 'MultiPolygon': None}
    if mpoint:
        result['MultiPoint'] = MultiPoint(mpoint)
    if mline:
        result['MultiLineString'] = MultiLineString(mline)
    if mpoly:
        result['MultiPolygon'] = MultiPolygon(mpoly)
    return result
def convert(self, data, cache, **kwargs):
    """Convert KML placemarks into an HTML <div> and store it in the cache.

    For every <Placemark>: each <name> becomes an <h2>, and each non-empty
    <description> is sanitized and embedded as a child <div>.

    Parameters:
        data: KML document (string) parseable by ElementTree's XML().
        cache: object whose setData() receives the serialized HTML body.

    Returns:
        the cache object, after setData(body) has been called on it.
    """
    bodydom = Element('div')
    kmldom = XML(data)
    # Root tag is '{namespace}kml'; strip('kml') leaves the '{namespace}'
    # prefix used to qualify element lookups.
    ns = kmldom.tag.strip('kml')
    placemarks = kmldom.findall('.//%sPlacemark' % ns)
    for placemark in placemarks:
        titles = placemark.findall(ns + 'name')
        for title in titles:
            t = Element('h2')
            t.text = title.text
            bodydom.append(t)
        descriptions = placemark.findall(ns + 'description')
        for desc in descriptions:
            if desc.text:
                try:
                    # Force ASCII, turning non-ASCII characters into XML
                    # character references.
                    text = desc.text.encode('ascii', 'xmlcharrefreplace').strip()
                except Exception:
                    # Narrowed from a bare except: keep the best-effort
                    # fallback to the raw text, but stop swallowing
                    # KeyboardInterrupt/SystemExit.
                    text = desc.text.strip()
                text = sanitize(text)
                d = XML('<div>' + text.encode('ascii', 'xmlcharrefreplace') + '</div>')
                bodydom.append(d)
    body = tostring(bodydom)
    cache.setData(body)
    return cache
def handler(req): url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=' ## uris should be of the form ## http://rguha.ath.cx/~rguha/cicc/rest/depict/SMILES uriParts = req.uri.split('/') ids = ','.join([x.strip() for x in uriParts[-1].split(',')]) url = url+ids if req.method not in ['GET']: req.err_headers_out['Allow'] = 'GET' raise apache.SERVER_RETURN, apache.HTTP_METHOD_NOT_ALLOWED result = '' headingCounts = {} narticle = 0 data = ''.join(urllib.urlopen(url).readlines()) doc = XML(data) for article in doc.findall('PubmedArticle'): narticle += 1 for mh in article.findall('MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName'): if mh.text in headingCounts.keys(): headingCounts[mh.text] += 1 else: headingCounts[mh.text] = 1 ## most frequent first headingCounts = sorted(headingCounts.items(), key = operator.itemgetter(1), reverse=True) for key,item in headingCounts: result += '%s # %d/%d\n' % (key, item, narticle) req.content_type = 'text/plain'; req.write(result) return apache.OK
def convert(self, data, cache, **kwargs):
    """Render KML placemark names and descriptions as HTML into the cache.

    Each <Placemark>'s <name> becomes an <h2>; each non-empty
    <description> is sanitized and appended as a <div>.

    NOTE(review): this method is a byte-for-byte duplicate of another
    convert() in this file — consider extracting a shared helper.

    Parameters:
        data: KML document (string) parseable by ElementTree's XML().
        cache: object whose setData() receives the serialized HTML body.

    Returns:
        the cache object, after setData(body) has been called on it.
    """
    bodydom = Element('div')
    kmldom = XML(data)
    # '{namespace}kml'.strip('kml') -> '{namespace}' element-name prefix.
    ns = kmldom.tag.strip('kml')
    placemarks = kmldom.findall('.//%sPlacemark' % ns)
    for placemark in placemarks:
        titles = placemark.findall(ns + 'name')
        for title in titles:
            t = Element('h2')
            t.text = title.text
            bodydom.append(t)
        descriptions = placemark.findall(ns + 'description')
        for desc in descriptions:
            if desc.text:
                try:
                    # ASCII-encode with XML character references for any
                    # non-ASCII characters.
                    text = desc.text.encode('ascii', 'xmlcharrefreplace').strip()
                except Exception:
                    # Narrowed from a bare except: preserve the fallback to
                    # the raw text without masking interrupts.
                    text = desc.text.strip()
                text = sanitize(text)
                d = XML('<div>' + text.encode('ascii', 'xmlcharrefreplace') + '</div>')
                bodydom.append(d)
    body = tostring(bodydom)
    cache.setData(body)
    return cache
def extract_title(data):
    """Return the first non-empty <name> text in a KML document, or 'N/A'.

    Parameters:
        data: KML document (string/bytes) parseable by ElementTree's XML().

    Returns:
        the stripped text of the first <name> element that has any,
        or 'N/A' when the document contains no usable <name>.
    """
    kmldom = XML(data)
    # Root tag is '{namespace}kml'; strip('kml') leaves '{namespace}'.
    ns = kmldom.tag.strip('kml')
    for title in kmldom.findall('.//%sname' % ns):
        # Empty <name/> elements have .text == None; the original code
        # raised AttributeError on .strip() in that case.
        if title.text and title.text.strip():
            return title.text.strip()
    return 'N/A'
def extract_description(data):
    """Concatenate the text of every <description> in a KML document.

    The Google Earth boilerplate 'Double click to zoom in' is skipped.

    Parameters:
        data: KML document (string/bytes) parseable by ElementTree's XML().

    Returns:
        the stripped description texts joined into one string (possibly '').
    """
    kmldom = XML(data)
    # Root tag is '{namespace}kml'; strip('kml') leaves '{namespace}'.
    ns = kmldom.tag.strip('kml')
    desc = ''
    for description in kmldom.findall('.//%sdescription' % ns):
        # Empty <description/> elements have .text == None; the original
        # code raised AttributeError on .strip() in that case.
        text = (description.text or '').strip()
        if text and text != 'Double click to zoom in':
            desc += text
    return desc
class EpubDocument(object):
    """A class that parses and provides data about an ePub file"""

    def __init__(self, fname):
        # This is done according to this:
        # http://stackoverflow.com/questions/1388467/reading-epub-format
        print(("Opening:", fname))
        try:
            self.book = zipfile.ZipFile(fname, "r")
        except zipfile.BadZipfile:
            raise ValueError("Invalid format")
        # META-INF/container.xml names the OPF "rootfile(s)" of the book.
        f = self.book.open('META-INF/container.xml')
        self.container = XML(f.read())
        f.close()
        roots = self.container.findall(
            './/{urn:oasis:names:tc:opendocument:xmlns:container}rootfile')
        self.roots = []
        for r in roots:
            self.roots.append(r.attrib['full-path'])
        # Only the first rootfile is used; hrefs inside the OPF are relative
        # to its directory, kept in self.basepath ('' when at archive root).
        opf = self.book.open(self.roots[0])
        self.basepath = os.path.dirname(self.roots[0]) + "/"
        if self.basepath == '/':
            self.basepath = ""
        print(("BASEPATH:", self.basepath))
        data = opf.read()
        self.opf = XML(data)
        opf.close()
        # manifest_dict: manifest item id -> basepath-prefixed href.
        self.manifest = self.opf.find('{http://www.idpf.org/2007/opf}manifest')
        self.manifest_dict = {}
        for elem in self.manifest.findall(
                '{http://www.idpf.org/2007/opf}item'):
            self.manifest_dict[elem.attrib['id']] = self.basepath + \
                elem.attrib['href']
        self.spine = self.opf.find('{http://www.idpf.org/2007/opf}spine')
        self.tocentries = []
        # The spine's optional 'toc' attribute names the NCX manifest item;
        # when absent there is no navmap to read.
        self.toc_id = self.spine.attrib.get('toc', None)
        if self.toc_id:
            self.toc_fn = self.manifest_dict[self.toc_id]
            print(("TOC:", self.toc_fn))
            f = self.book.open(self.toc_fn)
            data = f.read()
            self.toc = XML(data)
            self.navmap = self.toc.find(
                '{http://www.daisy.org/z3986/2005/ncx/}navMap')
            # FIXME: support nested navpoints
            self.navpoints = self.navmap.findall(
                './/{http://www.daisy.org/z3986/2005/ncx/}navPoint')
            # Each navPoint contributes a [label, content-src] pair.
            for np in self.navpoints:
                label = np.find(
                    '{http://www.daisy.org/z3986/2005/ncx/}navLabel').find(
                    '{http://www.daisy.org/z3986/2005/ncx/}text').text
                content = np.find(
                    '{http://www.daisy.org/z3986/2005/ncx/}content').attrib['src']
                if label and content:
                    self.tocentries.append([label, content])
        self.itemrefs = self.spine.findall(
            '{http://www.idpf.org/2007/opf}itemref')
        print(("IR:", self.itemrefs))
        # spinerefs: reading-order hrefs, relative to basepath (stripped).
        self.spinerefs = [
            self.manifest_dict[item.attrib['idref']][len(self.basepath):]
            for item in self.itemrefs]
        # I found one book that has a spine but no navmap:
        # "Der schwarze Baal" from manybooks.net
        # Also another has more entries on the spine than on the navmap
        # (Dinosauria, from feedbooks).
        # So, we need to merge these suckers. I will assume it's not completely
        # insane and the spine is always more complete.
        # Start with [href, href] placeholders, then overwrite entries that
        # have a real navmap label.  NOTE(review): spinerefs.index(te[1])
        # raises ValueError if a toc entry's src is missing from the spine
        # (or carries a #fragment) — assumes well-formed books; confirm.
        spinerefs2 = [[x, x] for x in self.spinerefs]
        for te in self.tocentries:
            idx = self.spinerefs.index(te[1])
            spinerefs2[idx] = te
        self.tocentries = spinerefs2
        # if not self.tocentries:
        #     # Alternative toc
        #     self.tocentries = [[item.attrib['idref'],
        # self.manifest_dict[item.attrib['idref']][len(self.basepath):]]
        # for item in self.itemrefs]
        print((self.tocentries))
        print((self.spinerefs))

    def getData(self, path):
        """Return the contents of a file in the document"""
        # Paths given by callers are relative to the OPF directory.
        path = "%s%s" % (self.basepath, path)
        try:
            f = self.book.open(path)
        except KeyError:
            # File missing in the zip; callers get an empty list sentinel.
            return []
        data = f.read()
        f.close()
        return data