def __init__(self, url=None, xml=None):
    parser = rdfadict.RdfaParser()

    # parse the supplied XML if provided; otherwise fetch and parse the URL
    if not xml:
        result = parser.parse_url(url)
    else:
        result = parser.parse_string(xml, url)

    data = result[url]
    self.metadata = self.get_properties(data)
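# For context, a minimal sketch (not from the original source; the sample
# markup and URLs are made up) of the dict-shaped result the constructor
# above indexes into: rdfadict's default sink maps each subject URI to a
# dict of predicate URIs, each holding the collection of object values.
import rdfadict

SAMPLE_HTML = ('<div xmlns:dc="http://purl.org/dc/elements/1.1/" '
               'about="http://example.com/doc" property="dc:title" '
               'content="Sample Document">x</div>')

parser = rdfadict.RdfaParser()
result = parser.parse_string(SAMPLE_HTML, 'http://example.com/doc')

# roughly: {'http://example.com/doc':
#              {'http://purl.org/dc/elements/1.1/title': ['Sample Document']}}
for subject in result.keys():
    for predicate in result[subject].keys():
        print subject, predicate, result[subject][predicate]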
def _load_source(self, url, subjects=None, sink=None, depth=2,
                 redirects=None):
    barf_if_not_http(url)

    # bail out if we've hit the parsing limit
    if depth < 0:
        return sink

    if redirects is None:
        redirects = {}
    if subjects is None:
        subjects = []

    parser = rdfadict.RdfaParser()

    try:
        # load the specified URL and parse the RDFa
        opener = urllib2.build_opener(TripleRedirectHandler(redirects))
        request = urllib2.Request(url)
        request.add_header(
            'User-Agent',
            'CC Metadata Scaper http://wiki.creativecommons.org/Metadata_Scraper')
        response = opener.open(request)
        contents = response.read()

        # default to a set-based triple sink
        if sink is None:
            sink = TripleDictSink(redirects)

        triples = parser.parse_string(contents, url, sink)

        # record this URL (and any redirect target) as visited
        if url in triples.keys() and url not in subjects:
            if url in redirects.keys():
                subjects.append(redirects[url])
            subjects.append(url)

        # look for possible predicates to follow
        for s in triples.keys():
            if s not in subjects:
                subjects.append(s)

            for p in triples[s].keys():
                if p in FOLLOW_PREDICATES:
                    # for each value of the predicate to follow
                    for o in triples[s][p]:
                        # follow if we haven't already looked here
                        if o not in subjects:
                            self._load_source(o, subjects, sink,
                                              depth - 1, redirects)

    except Exception, e:
        triples = {'_exception': str(e)}

    # return the collected triples, or the error marker on failure
    return triples
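# barf_if_not_http, FOLLOW_PREDICATES, TripleRedirectHandler and
# TripleDictSink are defined elsewhere in the scraper. A hypothetical
# sketch of the two simplest ones, to make the recursion above readable
# (the exact predicate list and error behavior are assumptions):
import urlparse

FOLLOW_PREDICATES = [
    'http://www.w3.org/1999/xhtml/vocab#license',
    'http://creativecommons.org/ns#morePermissions',
    'http://creativecommons.org/ns#attributionURL',
]

def barf_if_not_http(url):
    """Raise an exception unless [url] uses an http(s) scheme."""
    if urlparse.urlparse(url)[0] not in ('http', 'https'):
        raise Exception('Non-HTTP URL refused: %s' % url)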
def __init__(self): self.parser = rdfadict.RdfaParser() self.base = 'http://www.example.com/' self.lic = cc.license.by_code('by') self.fmtr = cc.license.formatters.HTML # define namespaces self.cc = rdflib.Namespace('http://creativecommons.org/ns#') self.dc = rdflib.Namespace('http://purl.org/dc/terms/') self.dc_type = rdflib.Namespace('http://purl.org/dc/dcmitype/') self.w3 = rdflib.Namespace('http://www.w3.org/1999/xhtml/vocab#') self.b = rdflib.Namespace(self.base)
def runTest(self):
    """Run the specified task force test."""
    import rdfadict
    import rdfadict.sink.graph

    print self._uri

    # set up our target sink
    g = Graph()
    sink = rdfadict.sink.graph.GraphSink(g)

    # run rdfadict over the input
    parser = rdfadict.RdfaParser()
    parser.parse_url(self._source, sink)

    # execute the test SPARQL
    results = g.query(self._result)
    print results.selected
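# A standalone sketch of the sink wiring used above (sample markup and
# base URL are made up; the top-level Graph import is an assumption about
# the rdflib version in use): GraphSink forwards every triple rdfadict
# emits into an rdflib Graph, so the parsed RDFa can be queried as RDF.
import rdfadict
import rdfadict.sink.graph
from rdflib import Graph

g = Graph()
sink = rdfadict.sink.graph.GraphSink(g)

parser = rdfadict.RdfaParser()
parser.parse_string(
    '<a xmlns:cc="http://creativecommons.org/ns#" rel="cc:license" '
    'href="http://creativecommons.org/licenses/by/3.0/">CC BY</a>',
    'http://example.com/', sink)

print len(g)  # number of triples extracted from the snippet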
def schemafy(html_file):
    """Extract RDF from RDFa-annotated [html_file]; return a L{Graph}
    containing the RDF."""

    # create an empty graph and bind some namespaces
    store = Graph()
    store.bind("cc", "http://creativecommons.org/ns#")
    store.bind("dc", "http://purl.org/dc/elements/1.1/")
    store.bind("dcq", "http://purl.org/dc/terms/")
    store.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    store.bind("xsd", "http://www.w3.org/2001/XMLSchema-datatypes#")
    store.bind("owl", "http://www.w3.org/2002/07/owl#")
    store.bind("xhtml", "http://www.w3.org/1999/xhtml/vocab#")

    # parse the source document
    parser = rdfadict.RdfaParser()
    parser.parse_file(file(html_file), "http://creativecommons.org/ns",
                      sink=GraphSink(store))

    # remove undesirable assertions
    remove_assertions(store)

    return store
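# remove_assertions is defined elsewhere; a hypothetical sketch of the
# kind of cleanup it performs -- deleting triples whose predicates sit on
# a blacklist (the predicate chosen here is an assumption):
from rdflib import URIRef

UNDESIRABLE_PREDICATES = [
    URIRef('http://www.w3.org/1999/xhtml/vocab#stylesheet'),
]

def remove_assertions(store):
    """Remove unwanted triples from [store] in place."""
    for predicate in UNDESIRABLE_PREDICATES:
        for triple in list(store.triples((None, predicate, None))):
            store.remove(triple)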
def __init__(self):
    self.lic = cc.license.by_code('by')
    self.parser = rdfadict.RdfaParser()
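# A hedged example of how this fixture might be exercised (the formatter
# call and attribute names are assumptions about the cc.license API):
# render the license badge HTML, parse its RDFa, and check that the
# license URI appears among the extracted subjects.
def test_license_uri_extracted(self):
    html = cc.license.formatters.HTML.format(self.lic)
    triples = self.parser.parse_string(html, 'http://www.example.com/')
    assert self.lic.uri in triples.keys()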