def extractfeatures_from_file(data):
    """Extract Point/LineString/Polygon geometries from a KML document.

    Parameters
    ----------
    data : str or bytes
        Raw KML markup.

    Returns
    -------
    dict
        {'MultiPoint': MultiPoint or None,
         'MultiLineString': MultiLineString or None,
         'MultiPolygon': MultiPolygon or None}
        A value is None when the document has no geometry of that kind.
    """
    kmldom = XML(data)
    # str.strip('kml') removes the *characters* k/m/l from both ends of the
    # tag; for a root tag of the form '{uri}kml' this happens to leave the
    # '{uri}' namespace prefix.  Fragile, but kept for consistency with the
    # other KML helpers in this file.
    ns = kmldom.tag.strip('kml')

    def _parse_tuple(token):
        """Split one KML 'lon,lat[,alt]' token into a list of floats."""
        return [float(c) for c in token.split(',')]

    def _parse_tuples(text):
        """Parse a whitespace-separated list of coordinate tuples."""
        return [_parse_tuple(token) for token in text.split()]

    mpoint = []
    mline = []
    mpoly = []

    for point in kmldom.findall('.//%sPoint' % ns):
        for coordinate in point.findall('.//%scoordinates' % ns):
            coords = _parse_tuple(coordinate.text.strip())
            try:
                mpoint.append(Point(coords))
            except Exception:
                # Bad geometry is logged and skipped, never fatal.  Log the
                # parsed coordinates (the original logged Element reprs).
                logger.info('invalid point geometry: %s', coords[:10])

    for line in kmldom.findall('.//%sLineString' % ns):
        for coordinate in line.findall('.//%scoordinates' % ns):
            coords = _parse_tuples(coordinate.text)
            try:
                mline.append(LineString(coords))
            except Exception:
                logger.info('invalid linestring geometry: %s', coords[:10])

    for polygon in kmldom.findall('.//%sPolygon' % ns):
        for coordinate in polygon.findall('.//%scoordinates' % ns):
            coords = _parse_tuples(coordinate.text)
            try:
                mpoly.append(Polygon(coords))
            except Exception:
                logger.info('invalid polygon geometry: %s', coords[:10])

    result = {'MultiPoint': None, 'MultiLineString': None, 'MultiPolygon': None}
    if mpoint:
        result['MultiPoint'] = MultiPoint(mpoint)
    if mline:
        result['MultiLineString'] = MultiLineString(mline)
    if mpoly:
        result['MultiPolygon'] = MultiPolygon(mpoly)

    return result
    def convert(self, data, cache, **kwargs):
        """Convert KML *data* into an HTML fragment and store it on *cache*.

        Each Placemark's <name> becomes an <h2>; each non-empty
        <description> is sanitized and wrapped in a <div>.  Returns *cache*
        after calling cache.setData() with the serialized markup.
        """
        bodydom = Element('div')
        kmldom = XML(data)
        # str.strip('kml') removes the chars k/m/l from both ends of the
        # root tag, leaving the '{namespace}' prefix for a '{uri}kml' root.
        ns = kmldom.tag.strip('kml')
        placemarks = kmldom.findall('.//%sPlacemark' % ns)
        for placemark in placemarks:
            titles = placemark.findall(ns + 'name')
            for title in titles:
                # Each Placemark name becomes a section heading.
                t = Element('h2')
                t.text = title.text
                bodydom.append(t)

            descriptions = placemark.findall(ns+'description')
            for desc in descriptions:
                if desc.text:
                    try:
                        # NOTE(review): .encode returns bytes on Python 3,
                        # so the '<div>' + ... concatenation below only
                        # works on Python 2 -- confirm target interpreter.
                        text = desc.text.encode('ascii', 'xmlcharrefreplace').strip()
                    except:
                        text = desc.text.strip()
                    text = sanitize(text)
                    d = XML('<div>' + text.encode('ascii', 'xmlcharrefreplace') + '</div>')
                    bodydom.append(d)

        body = tostring(bodydom)
        cache.setData(body)
        return cache
Exemple #3
0
def handler(req):
    """mod_python handler: report MeSH descriptor frequencies for PubMed ids.

    The request URI must end in a comma-separated PubMed id list; the
    response is plain text, one 'Heading # count/narticles' line per MeSH
    descriptor, most frequent first.  Only GET is allowed.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id='

    ## uris should be of the form
    ## http://rguha.ath.cx/~rguha/cicc/rest/depict/SMILES
    uriParts = req.uri.split('/')
    ids = ','.join(x.strip() for x in uriParts[-1].split(','))
    url = url + ids

    if req.method not in ['GET']:
        req.err_headers_out['Allow'] = 'GET'
        # Python-3 raise syntax; the old "raise X, Y" form is a SyntaxError.
        raise apache.SERVER_RETURN(apache.HTTP_METHOD_NOT_ALLOWED)

    headingCounts = {}
    narticle = 0

    # .read() fetches the whole body; joining readlines() was redundant.
    data = urllib.urlopen(url).read()
    doc = XML(data)
    for article in doc.findall('PubmedArticle'):
        narticle += 1
        for mh in article.findall('MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName'):
            # Single-lookup counting instead of `in d.keys()` + two accesses.
            headingCounts[mh.text] = headingCounts.get(mh.text, 0) + 1

    ## most frequent first
    ranked = sorted(headingCounts.items(), key=operator.itemgetter(1), reverse=True)
    # join() avoids the quadratic `result +=` string build.
    result = ''.join('%s # %d/%d\n' % (key, item, narticle)
                     for key, item in ranked)

    req.content_type = 'text/plain'
    req.write(result)
    return apache.OK
Exemple #4
0
    def convert(self, data, cache, **kwargs):
        """Render the Placemarks of a KML document as an HTML fragment.

        Every <name> is emitted as an <h2>, every non-empty <description>
        is sanitized and wrapped in a <div>; the serialized markup is
        stored on *cache*, which is returned.
        """
        root = XML(data)
        # strip('kml') leaves the '{namespace}' prefix of the root tag.
        ns = root.tag.strip('kml')
        body = Element('div')

        for placemark in root.findall('.//%sPlacemark' % ns):
            for name_el in placemark.findall(ns + 'name'):
                heading = Element('h2')
                heading.text = name_el.text
                body.append(heading)

            for desc_el in placemark.findall(ns + 'description'):
                if not desc_el.text:
                    continue
                try:
                    text = desc_el.text.encode('ascii',
                                               'xmlcharrefreplace').strip()
                except:
                    text = desc_el.text.strip()
                text = sanitize(text)
                wrapped = XML('<div>' +
                              text.encode('ascii', 'xmlcharrefreplace') +
                              '</div>')
                body.append(wrapped)

        cache.setData(tostring(body))
        return cache
Exemple #5
0
def extract_title(data):
    """Return the text of the first <name> element in a KML document.

    Parameters
    ----------
    data : str or bytes
        Raw KML markup.

    Returns
    -------
    str
        The stripped title, or 'N/A' when no <name> (or only an empty
        <name/>) is present.
    """
    kmldom = XML(data)
    # strip('kml') removes the chars k/m/l from both ends of the root tag,
    # leaving the '{namespace}' prefix for a '{uri}kml' root element.
    ns = kmldom.tag.strip('kml')
    titles = kmldom.findall('.//%sname' % ns)
    # Guard against an empty <name/> whose .text is None -- the original
    # raised AttributeError on .strip() there.
    if titles and titles[0].text is not None:
        return titles[0].text.strip()
    return 'N/A'
Exemple #6
0
def extract_description(data):
    """Concatenate the text of every <description> element in a KML doc.

    The map-viewer boilerplate 'Double click to zoom in' is skipped.
    Empty <description/> elements (whose .text is None) are ignored
    instead of raising AttributeError as the original did.

    Parameters
    ----------
    data : str or bytes
        Raw KML markup.

    Returns
    -------
    str
        All stripped description texts joined together ('' when none).
    """
    kmldom = XML(data)
    # strip('kml') leaves the '{namespace}' prefix of the root tag.
    ns = kmldom.tag.strip('kml')
    parts = []
    for description in kmldom.findall('.//%sdescription' % ns):
        text = (description.text or '').strip()
        if text and text != 'Double click to zoom in':
            parts.append(text)
    # join() avoids the quadratic `desc +=` string build.
    return ''.join(parts)
Exemple #7
0
class EpubDocument(object):
    """A class that parses and provides
    data about an ePub file"""

    def __init__(self, fname):
        """Open the ePub at *fname* and index its manifest, spine and TOC.

        Raises ValueError when *fname* is not a valid zip archive.
        """
        # This is done according to this:
        # http://stackoverflow.com/questions/1388467/reading-epub-format

        print(("Opening:", fname))
        try:
            self.book = zipfile.ZipFile(fname, "r")
        except zipfile.BadZipfile:
            raise ValueError("Invalid format")

        # META-INF/container.xml names the OPF "rootfile(s)" of the book.
        f = self.book.open('META-INF/container.xml')
        self.container = XML(f.read())
        f.close()
        roots = self.container.findall(
                './/{urn:oasis:names:tc:opendocument:xmlns:container}rootfile')
        self.roots = []
        for r in roots:
            self.roots.append(r.attrib['full-path'])
        opf = self.book.open(self.roots[0])
        # Manifest hrefs are relative to the OPF's directory in the archive.
        self.basepath = os.path.dirname(self.roots[0]) + "/"
        if self.basepath == '/':
            self.basepath = ""
        print(("BASEPATH:", self.basepath))

        data = opf.read()
        self.opf = XML(data)
        opf.close()
        # manifest_dict: item id -> archive path of every resource.
        self.manifest = self.opf.find('{http://www.idpf.org/2007/opf}manifest')
        self.manifest_dict = {}
        for elem in self.manifest.findall(
                            '{http://www.idpf.org/2007/opf}item'):
            self.manifest_dict[elem.attrib['id']] = self.basepath + \
                                                    elem.attrib['href']

        # spine: the linear reading order of the manifest items.
        self.spine = self.opf.find('{http://www.idpf.org/2007/opf}spine')

        self.tocentries = []
        self.toc_id = self.spine.attrib.get('toc', None)
        if self.toc_id:
            # Parse the NCX table of contents into [label, content] pairs.
            self.toc_fn = self.manifest_dict[self.toc_id]
            print(("TOC:", self.toc_fn))
            f = self.book.open(self.toc_fn)
            data = f.read()
            self.toc = XML(data)
            self.navmap = self.toc.find(
                            '{http://www.daisy.org/z3986/2005/ncx/}navMap')
            # FIXME: support nested navpoints
            self.navpoints = self.navmap.findall(
                        './/{http://www.daisy.org/z3986/2005/ncx/}navPoint')
            for np in self.navpoints:
                label = np.find(
                    '{http://www.daisy.org/z3986/2005/ncx/}navLabel').find(
                            '{http://www.daisy.org/z3986/2005/ncx/}text').text
                content = np.find(
                 '{http://www.daisy.org/z3986/2005/ncx/}content').attrib['src']
                if label and content:
                    self.tocentries.append([label, content])

        self.itemrefs = self.spine.findall(
                                    '{http://www.idpf.org/2007/opf}itemref')
        print(("IR:", self.itemrefs))
        # spinerefs: spine entries as paths relative to basepath.
        self.spinerefs = [
            self.manifest_dict[item.attrib['idref']][len(self.basepath):]
                                                    for item in self.itemrefs]
        # I found one book that has a spine but no navmap:
        # "Der schwarze Baal" from manybooks.net
        # Also another has more entries on the spine than on the navmap
        # (Dinosauria, from feedbooks).
        # So, we need to merge these suckers. I will assume it's not completely
        # insane and the spine is always more complete.

        # Seed [path, path] placeholders for every spine item, then
        # overwrite the ones that have a real label in the TOC.
        spinerefs2 = [[x, x] for x in self.spinerefs]

        for te in self.tocentries:
            idx = self.spinerefs.index(te[1])
            spinerefs2[idx] = te

        self.tocentries = spinerefs2
        # if not self.tocentries:
            # # Alternative toc
            # self.tocentries = [[item.attrib['idref'],
             #self.manifest_dict[item.attrib['idref']][len(self.basepath):]]
                                                    #for item in self.itemrefs]

        print((self.tocentries))
        print((self.spinerefs))

    def getData(self, path):
        """Return the contents of a file in the document"""

        # Paths are resolved relative to the OPF directory in the archive.
        path = "%s%s" % (self.basepath, path)
        try:
            f = self.book.open(path)
        except KeyError:  # File missing in the zip
            # NOTE(review): returns [] (not b'') for missing files; callers
            # apparently only need a falsy value -- confirm before changing.
            return []
        data = f.read()
        f.close()
        return data