Esempio n. 1
0
 def __init__(self, url=None, xml=None):
     """Parse RDFa for ``url`` and store the extracted properties.

     When ``xml`` is truthy it is parsed as the document body, with ``url``
     passed along as the second argument to the parser; otherwise the parser
     is asked to load ``url`` itself. The entry of the parse result keyed by
     ``url`` is handed to ``self.get_properties`` and the outcome is kept
     in ``self.metadata``.
     """
     rdfa = rdfadict.RdfaParser()
     if xml:
         triples = rdfa.parse_string(xml, url)
     else:
         triples = rdfa.parse_url(url)
     # only the statements keyed by the document URL itself are used
     self.metadata = self.get_properties(triples[url])
Esempio n. 2
0
    def _load_source(self,
                     url,
                     subjects=None,
                     sink=None,
                     depth=2,
                     redirects=None):
        """Fetch ``url``, parse it with rdfadict and follow selected links.

        The document is downloaded with urllib2 and handed to
        ``RdfaParser.parse_string`` together with ``sink``.  For every
        subject in the parse result, each object of a predicate listed in
        ``FOLLOW_PREDICATES`` is loaded recursively with ``depth - 1``,
        unless that object is already present in ``subjects``.

        Parameters:
          url -- address to load; checked by ``barf_if_not_http`` first
              (presumably raises for non-HTTP URLs -- TODO confirm).  That
              check runs outside the ``try`` block, so whatever it raises
              propagates to the caller.
          subjects -- list of subjects/URLs already seen; mutated in place
              and shared with the recursive calls.  A new list is created
              when ``None``.
          sink -- triple sink passed to the parser and to recursive calls.
              A ``TripleDictSink(redirects)`` is created when ``None``.
          depth -- remaining recursion budget; a negative value stops the
              descent.
          redirects -- dict shared with ``TripleRedirectHandler`` and
              ``TripleDictSink``; a new dict is created when ``None``.

        Returns ``sink`` unchanged when ``depth`` is negative.  Every other
        path through the code shown here falls off the end and returns
        ``None``.

        NOTE(review): because nothing is returned on the normal path, a
        sink created inside this call is not handed back to the caller in
        the code shown -- confirm whether a trailing ``return sink`` is
        missing.
        """
        barf_if_not_http(url)

        # bail out if we've hit the parsing limit
        if depth < 0:
            return sink

        # create fresh containers per top-level call instead of using
        # mutable default arguments
        if redirects is None:
            redirects = {}

        if subjects is None:
            subjects = []

        parser = rdfadict.RdfaParser()

        try:
            # load the specified URL and parse the RDFa
            opener = urllib2.build_opener(TripleRedirectHandler(redirects))
            request = urllib2.Request(url)
            request.add_header(
                'User-Agent',
                'CC Metadata Scaper http://wiki.creativecommons.org/Metadata_Scraper'
            )
            response = opener.open(request)
            contents = response.read()

            # default to a set-based triple sink
            if sink is None:
                sink = TripleDictSink(redirects)

            # the result is used below as a nested mapping:
            # subject -> predicate -> iterable of objects
            triples = parser.parse_string(contents, url, sink)

            # look for possible predicates to follow
            # NOTE(review): ``url`` is appended here only when it is also a
            # key in ``redirects``; the loop below appends every unseen
            # subject anyway -- confirm the nesting is intended.
            if url in triples.keys() and url not in subjects:
                if url in redirects.keys():
                    subjects.append(redirects[url])
                    subjects.append(url)

            for s in triples.keys():
                # record the subject so it is not followed again later
                if s not in subjects:
                    subjects.append(s)
                for p in triples[s].keys():
                    if p in FOLLOW_PREDICATES:
                        # for each value of the predicate to follow
                        for o in triples[s][p]:
                            # follow if we haven't already looked here
                            if o not in subjects:
                                # the recursive call's return value is ignored;
                                # results are expected to accumulate in the
                                # shared ``sink``
                                self._load_source(o, subjects, sink, depth - 1,
                                                  redirects)

        # Any failure while fetching, parsing or recursing is swallowed here.
        # NOTE(review): the message is stored in the local ``triples``, which
        # is not read again within this block, so the error is effectively
        # discarded.
        except Exception, e:
            triples = {'_exception': str(e)}
Esempio n. 3
0
 def __init__(self):
     """Set up the parser, base URI, license, formatter and namespaces.

     The license is looked up via ``cc.license.by_code('by')`` and the
     formatter is ``cc.license.formatters.HTML``.  One ``rdflib.Namespace``
     is created per vocabulary URI below, plus one for the base URI itself.
     """
     self.parser = rdfadict.RdfaParser()
     self.base = 'http://www.example.com/'
     self.lic = cc.license.by_code('by')
     self.fmtr = cc.license.formatters.HTML

     # vocabularies, each wrapped in an rdflib namespace
     namespace = rdflib.Namespace
     self.cc = namespace('http://creativecommons.org/ns#')
     self.dc = namespace('http://purl.org/dc/terms/')
     self.dc_type = namespace('http://purl.org/dc/dcmitype/')
     self.w3 = namespace('http://www.w3.org/1999/xhtml/vocab#')
     # namespace rooted at the base URI assigned above
     self.b = namespace(self.base)
Esempio n. 4
0
    def runTest(self):
        """Run the specified task force test."""
        import rdfadict
        import rdfadict.sink.graph

        print self._uri

        # set up our target sink
        g = Graph()
        sink = rdfadict.sink.graph.GraphSink(g)

        # run rdfadict over the input
        parser = rdfadict.RdfaParser()
        parser.parseurl(self._source, sink)

        # execute the test SPARQL
        g.query(self._result)

        print g.selected
Esempio n. 5
0
def schemafy(html_file):
    """Extract RDF from RDFa-annotated [html_file]; return a L{Graph}
    containing the RDF.

    [html_file] is the path of the document to read.  It is parsed with
    "http://creativecommons.org/ns" passed as the second argument to
    C{parse_file}, and the triples are collected into the returned graph
    through a L{GraphSink}.  C{remove_assertions} is applied to the graph
    before it is returned.

    The file is opened with a context manager so the handle is closed even
    when parsing raises; errors from opening or parsing propagate to the
    caller.
    """

    # create an empty graph and bind some namespaces
    store = Graph()
    store.bind("cc", "http://creativecommons.org/ns#")
    store.bind("dc", "http://purl.org/dc/elements/1.1/")
    store.bind("dcq", "http://purl.org/dc/terms/")
    store.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    store.bind("xsd", "http://www.w3.org/2001/XMLSchema-datatypes#")
    store.bind("owl", "http://www.w3.org/2002/07/owl#")
    store.bind("xhtml", "http://www.w3.org/1999/xhtml/vocab#")

    # parse the source document; ``with`` guarantees the file handle is
    # released instead of being left for the garbage collector
    parser = rdfadict.RdfaParser()
    with open(html_file) as source:
        parser.parse_file(source,
                          "http://creativecommons.org/ns",
                          sink=GraphSink(store))

    # remove undesirable assertions
    remove_assertions(store)

    return store
Esempio n. 6
0
 def __init__(self):
     """Store the license looked up by code 'by' and a fresh RDFa parser."""
     by_license = cc.license.by_code('by')
     self.lic = by_license
     self.parser = rdfadict.RdfaParser()