# NB: os, urllib.request, conneg, OreException, accept_header and the
# StringIO base class are module-level names assumed imported elsewhere.
class ReMDocument(StringIO):

    def __init__(self, uri, data='', filename='', mimeType='', format='',
                 accept=''):
        self.uri = uri
        if data:
            self.data = data
        elif filename:
            if os.path.exists(filename):
                fh = open(filename)
                self.data = fh.read()
                fh.close()
        else:
            # try to fetch uri
            try:
                req = urllib.request.Request(uri)
                if accept:
                    # add custom accept header
                    req.add_header('Accept', accept)
                else:
                    # otherwise add default
                    req.add_header('Accept', accept_header)
                fh = urllib.request.urlopen(req)
                # NB: read() returns bytes in Python 3; the data may need
                # decoding before being handed to StringIO below
                self.data = fh.read()
                self.info = fh.info()
                mimeType = self.info.get('content-type', mimeType)
                self.uri = fh.geturl()
                fh.close()
            except:
                raise OreException(
                    'ReMDocument must either have data or filename')
        if not format:
            try:
                mt = conneg.parse(mimeType)
                if mt:
                    mimeType = mt[0].mimetype1 + '/' + mt[0].mimetype2
            except:
                pass
            mimeHash = {'application/atom+xml': 'atom',
                        'application/xhtml+xml': 'rdfa',
                        'application/rdf+xml': 'xml',
                        'text/plain': 'nt',  # yes, really
                        'text/rdf+n3': 'n3',
                        'application/x-turtle': 'turtle',
                        'application/rdf+nt': 'nt'}
            format = mimeHash.get(mimeType, '')
        self.mimeType = mimeType
        self.format = format
        StringIO.__init__(self, self.data)
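
# --- Usage sketch for ReMDocument, assuming the class is importable from
# --- the foresite package this excerpt appears to come from; the URIs and
# --- the filename below are placeholders.
from foresite import ReMDocument, RdfLibParser

# 1. wrap in-memory data directly; nothing is fetched
rd = ReMDocument('http://example.org/rem.xml', data='<rdf:RDF>...</rdf:RDF>')

# 2. read the document body from a local file
rd = ReMDocument('http://example.org/rem.xml', filename='rem.rdf')

# 3. neither given: dereference the URI with an Accept header; mimeType and
#    format are then inferred from the response's Content-Type
rd = ReMDocument('http://example.org/rem.xml', accept='application/rdf+xml')
rem = RdfLibParser().parse(rd)   # parse the fetched document into a resource map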
def crawl(uri, src):
    if uri not in pageHash:
        pid = len(pageHash)
        pageHash[uri] = pid
    else:
        pid = pageHash[uri]
    linkHash = webGraphs[-1]
    if pid not in linkHash:
        linkHash[pid] = []
    else:
        return
    print("processing %s->%s: %s" % (src, pid, uri))
    if src != -1:
        linkHash[src].append(pid)
    # fetch, find links, record, crawl
    try:
        fh = urllib.request.urlopen(uri)
    except:
        print("... BROKEN")
        return
    ar = AggregatedResource(uri)
    ct = fh.headers['content-type']
    try:
        cl = fh.headers['content-length']
        ar._dc.extent = Literal(cl)
    except:
        pass
    try:
        lm = fh.headers['last-modified']
        ar._dcterms.modified = Literal(lm)
    except:
        pass
    mt = conneg.parse(ct)
    if mt:
        ct = mt[0].mimetype1 + '/' + mt[0].mimetype2
    ar._dc.format = Literal(ct)
    if ct != 'text/html':
        aggr.add_resource(ar)
        try:
            contentTypes[ct] += 1
        except KeyError:
            contentTypes[ct] = 1
        return
    data = fh.read()
    fh.close()
    # hash page for redirects/duplicates etc
    md5 = hashlib.new('md5')
    md5.update(data)
    hd = md5.hexdigest()
    if hd in md5Hash:
        print("%s == %s" % (pid, md5Hash[hd]))
        return
    else:
        md5Hash[hd] = pid
    # only add the resource here, after duplicate detection
    aggr.add_resource(ar)
    try:
        # urlopen() returns bytes in Python 3, so parse via BytesIO
        dom = etree.parse(io.BytesIO(data), parser)
    except:
        print(" --- failed to parse")
        return
    title = dom.xpath('//title/text()')
    if title:
        ar._dc.title = Literal(title[0])
    links = dom.xpath('//a/@href')
    frames = dom.xpath('//frame/@src')
    links.extend(frames)
    imgs = dom.xpath('//img/@src')
    links.extend(imgs)
    css = dom.xpath('//link/@href')
    links.extend(css)
    for l in links:
        l = l.strip()
        if l.find('#') > -1:
            l = l[:l.find('#')]
        if not l:
            # was just a hash URL
            continue
        if l[0] == "/":
            l = urllib.parse.urljoin(uri, l)
        elif l[:7].lower() != "http://" and l[:8].lower() != "https://":
            # check other protocols
            if nonHttpRe.search(l):
                continue
            # put in current directory
            l = urllib.parse.urljoin(uri, l)
        # check if we really want to crawl...
        if nonHtmlRe.search(l):
            # ignore common stuff
            # print("Skipping: %s" % l)
            pass
        elif l in pageHash:
            # ignore already done
            # print("Skipping: %s" % l)
            pass
        else:
            match = 1
            for t in restrictTemplates:
                if not t.match(l):
                    match = 0
                    break
            if match:
                stack.append((l, pid))
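
# --- Hypothetical driver for crawl() above: a sketch only. The names it
# --- binds are the globals crawl() expects; the seed URI, aggregation URI
# --- and regex patterns are illustrative assumptions, not original values.
import hashlib
import io
import re
import urllib.parse
import urllib.request

import conneg                     # assumed: the same conneg helper used above
from foresite import Aggregation, AggregatedResource
from lxml import etree
from rdflib import Literal

pageHash = {}                     # uri -> page id
webGraphs = [{}]                  # one link graph per crawl: pid -> [pid, ...]
md5Hash = {}                      # content digest -> pid (duplicate detection)
contentTypes = {}                 # counts of non-HTML media types seen
stack = []                        # worklist of (uri, source-pid) pairs
parser = etree.HTMLParser()
aggr = Aggregation('http://example.org/aggregation')

# crawl only URIs matching every template; skip non-http schemes and
# obviously non-HTML resources
restrictTemplates = [re.compile(r'^http://example\.org/')]
nonHttpRe = re.compile(r'^(mailto|javascript|news|ftp):', re.I)
nonHtmlRe = re.compile(r'\.(pdf|jpe?g|png|gif|zip|gz|mp3|avi)$', re.I)

stack.append(('http://example.org/', -1))   # -1 marks the seed page
while stack:
    (uri, src) = stack.pop(0)
    crawl(uri, src)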
srlzHash['old-atom.xml'].mimeType = "application/atom+xml;version=0.9"
srlzHash['pretty.xml'].mimeType += ";format=pretty"

p = RdfLibParser()
p.strict = True
ap = AtomParser()
ap.strict = True
rdfap = RdfAParser()
rdfap.strict = True

# map each serializer's MIME type back to its extension key
mimeHash = {}
for (k, v) in srlzHash.items():
    mimeHash[v.mimeType] = k
mimestr = ', '.join(mimeHash.keys())
mimeList = conneg.parse(mimestr)

protoUriRe = re.compile(
    r"^([s]?http[s]?://|[t]?ftp:/|z39\.50r:|gopher:|imap://|news:|nfs:|nntp:|rtsp:)"
)


class validateHandler:
    def send(self, text, req, code=200, ct="text/xml"):
        req.content_type = ct
        req.content_length = len(text)
        req.send_http_header()
        if isinstance(text, str):
            req.write(text.encode('utf-8'))
        else:
            req.write(text)

    def error(self, msg, req):
        text = "<html><body><h3>Error</h3><p>%s</p></body></html>" % msg
        # send the error page back to the client
        self.send(text, req, ct="text/html")
    def handle(self, req):
        path = req.uri[5:]
        form = FieldStorage(req)
        strict = form.get('strict', True)
        if strict in ['false', 'False', '0', None, '']:
            strict = False
        mt = form.get('mimeType', '')
        mt = mt.replace(' ', '+')
        if not mt:
            xtn = form.get('extension', '')
            if xtn:
                if xtn not in srlzHash:
                    # can't continue
                    raise ValueError(xtn)
                else:
                    mt = srlzHash[xtn].mimeType
        if not mt:
            try:
                wanted = req.headers_in['Accept']
                mts = conneg.parse(wanted)
                mt = conneg.best(mts, mimeList)
            except:
                mt = ''
        if mt:
            xtn = mimeHash[str(mt)]
        else:
            # default to rdf/xml
            xtn = "rdf.xml"
        srlz = srlzHash[xtn]
        if 'aggregation' in form:
            uri = form.get('aggregation')
        else:
            uri = path
        if not uri:
            data = '<html><body>Instructions etc. go here</body></html>'
            self.send(data, req, ct="text/html")
            return
        elif not protoUriRe.match(uri):
            self.error("Resource Map URI must be a protocol based URI", req)
            return
        try:
            # fetch
            rd = ReMDocument(uri)
        except Exception as e:
            self.error(
                "Could not retrieve Resource Map from '%s': %s" % (uri, e),
                req)
            return
        try:
            # parse
            if rd.format == 'atom':
                parser = ap
            elif rd.format == 'rdfa':
                parser = rdfap
            else:
                parser = p
            if not strict:
                parser.strict = False
            try:
                rem = parser.parse(rd)
                parser.strict = True
            except:
                # restore strict mode before propagating the failure
                parser.strict = True
                raise
        except OreException as e:
            # report the exception message
            self.error("Resource Map Invalid: %s" % e, req)
            return
        except SAXParseException as e:
            self.error(
                "Could not parse XML: %s (line %s, column %s)" %
                (e.getMessage(), e.getLineNumber(), e.getColumnNumber()),
                req)
            return
        except:
            raise
        try:
            # serialize
            rem2 = rem._aggregation_.register_serialization(
                srlz, 'http://foresite.cheshire3.org/%s#rem' % req.uri)
            rd = rem2.get_serialization()
            data = rd.data
            if srlz == srlzHash['rdfa.html']:
                data = ('<xhtml xmlns="http://www.w3.org/1999/xhtml"><body>'
                        '<i>Invisible RDFa resource map follows, it must have '
                        'validated okay. [view source] :)</i>' + data +
                        '</body></xhtml>')
        except Exception as e:
            self.error(
                "Could not serialize Aggregation to Resource Map: %s" % e,
                req)
            return
        self.send(data, req, ct=srlz.mimeType)
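
# --- Hypothetical client calls against validateHandler above; the mount
# --- point is a placeholder. handle() picks the output format from the
# --- mimeType parameter first, then the extension parameter, then the
# --- request's Accept header, defaulting to RDF/XML.
import urllib.parse
import urllib.request

base = 'http://foresite.cheshire3.org/validate'      # placeholder URL
rem = 'http://example.org/rem.xml'

# an explicit mimeType parameter wins over everything else
url = base + '?' + urllib.parse.urlencode(
    {'aggregation': rem, 'mimeType': 'application/rdf+xml'})
print(urllib.request.urlopen(url).read()[:200])

# with no mimeType or extension, the Accept header is negotiated
req = urllib.request.Request(base + '?aggregation=' + urllib.parse.quote(rem))
req.add_header('Accept', 'application/atom+xml')
print(urllib.request.urlopen(req).read()[:200])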