Ejemplo n.º 1
0
    def command(self):
        """Register the RDF packages of the pickled CKAN catalogs as RDFDoc rows.

        Reads the list of catalogs from ``/tmp/ckan_catalogs.pickled``. For
        every package in a catalog's ``rdfpackages`` list, the first resource
        that is not None supplies the URL and format. A package whose name is
        already stored as an ``RDFDoc`` is left untouched; otherwise a new
        ``RDFDoc`` is added and committed immediately.

        Packages that have no usable (non-None) resource are skipped.
        """
        ckanCatalogPath = "/tmp/ckan_catalogs.pickled"
        # NOTE(review): pickle.load can execute arbitrary code; this assumes
        # the file under /tmp is produced by a trusted process - confirm.
        # Pickle data is binary, so the file is opened in 'rb' mode and
        # closed deterministically by the context manager.
        with open(ckanCatalogPath, 'rb') as f:
            ckanCatalogs = pickle.load(f)

        for catalog in ckanCatalogs:
            prefix = catalog['prefix']  # e.g. datagov
            for package in catalog['rdfpackages']:
                # name is a part of the URI http://catalog.data.gov/dataset/name
                rdfPackageName = package['name']

                # just pick up the first resource which is not None
                rdfResource = None
                for resource in package['resources']:
                    if resource is not None:
                        rdfResource = resource
                        break
                if rdfResource is None:
                    # Nothing to register: no URL/format can be derived.
                    continue

                existing = Session.query(model.RDFDoc).filter(
                    model.RDFDoc.name == rdfPackageName).first()
                if existing:
                    continue

                newRdfdoc = model.RDFDoc(name=rdfPackageName,
                                         uri=rdfResource['url'],
                                         format=rdfResource['format'],
                                         ckan_catalog=prefix)
                Session.add(newRdfdoc)
                Session.commit()
Ejemplo n.º 2
0
 def callback_stats(self, rdfdocstat):
     """Persist statistics progress every 10000 processed triples.

     ``rdfdocstat`` must provide ``get_no_of_triples()`` and a ``warnings``
     attribute. Nothing is written unless the triple count is positive and
     an exact multiple of 10000.
     """
     triple_count = rdfdocstat.get_no_of_triples()
     if not triple_count > 0:
         return
     if triple_count % 10000 != 0:
         return
     # checkpoint reached: record progress and flush it to the database
     progress = self.stat_result
     progress.triples_done = triple_count
     progress.warnings = rdfdocstat.warnings
     Session.commit()
Ejemplo n.º 3
0
 def reset_current_stats_and_worker(self):
     """Delete the current stats and worker, then clear the bookkeeping fields.

     When ``current_stats`` is set, its ``prep_delete()`` is called and
     committed before the stats object itself is deleted and committed.
     A set ``worker`` is deleted as well. Finally ``last_updated`` and
     ``file_last_modified`` are set to None, ``worked_on`` to False, and
     the session is committed.
     """
     stats = self.current_stats
     if stats is not None:
         # two-step removal: dependent rows first, then the stats row itself
         stats.prep_delete()
         Session.commit()
         Session.delete(stats)
         Session.commit()

     worker = self.worker
     if worker is not None:
         Session.delete(worker)

     self.last_updated = self.file_last_modified = None
     self.worked_on = False
     Session.commit()
Ejemplo n.º 4
0
    def command(self):
        """Export every RDFDoc from the database into ``lodstats.nt``.

        Opens the ``lodstatsrdf`` RDF file storage, loads the pickled CKAN
        catalogs from ``/tmp/ckan_catalogs.pickled``, calls
        ``self.generateRdfForRdfDoc`` for each ``RDFDoc`` row and finally
        serializes the RDF model as N-Triples.

        A failure while processing a single document is printed and does not
        abort the run; KeyboardInterrupt and SystemExit are no longer
        swallowed so the command can be stopped.
        """
        print("Loading RDF database...")
        rdfStorage = RDF.FileStorage("lodstatsrdf")
        rdfModel = RDF.Model(rdfStorage)
        print("Finished!")

        ckanCatalogPath = "/tmp/ckan_catalogs.pickled"
        print("Reading " + ckanCatalogPath)
        # NOTE(review): pickle.load can execute arbitrary code; this assumes
        # the file under /tmp is produced by a trusted process - confirm.
        # Pickle data is binary: open in 'rb' and let `with` close the file.
        with open(ckanCatalogPath, 'rb') as f:
            ckanCatalogs = pickle.load(f)
        print("Finished!")

        # Fetch all the rdfdocs from the DB
        print("Fetching the data from DB...")
        rdfdocs = Session.query(model.RDFDoc).all()
        print("Fetched the data from DB!")

        overall = len(rdfdocs)
        # Count from 1 so the last message reads "N out of N".
        for num, rdfdoc in enumerate(rdfdocs, 1):
            print("Processing %d out of %d" % (num, overall))
            try:
                self.generateRdfForRdfDoc(rdfdoc, rdfModel, ckanCatalogs)
            except Exception as e:
                # best effort: report and carry on with the next document
                print("Oops, exception occured: "+str(e))

        serializer = RDF.Serializer(name="ntriples")
        serializer.serialize_model_to_file("lodstats.nt", rdfModel, base_uri=None)
Ejemplo n.º 5
0
    def process_dataset(self, id):
        """Compute statistics for the RDFDoc with the given id.

        Registers this process as a ``WorkerProc`` for the document, attaches
        a fresh ``StatResult`` as the document's ``current_stats`` and starts
        an ``RDFStats`` run with the download/extraction/statistics callbacks
        of this object.

        Returns 0 when ``self._get_dataset(id)`` yields None. Exits the
        process via ``sys.exit(0)`` when the document's current stats are
        flagged with ``errors == 'broken'``.

        NOTE(review): only the beginning of this method is visible here; the
        try block below continues past the shown ``except NotModified``.
        """
        self.logging_file_config(config_file)
        log = logging.getLogger(__name__)

        # state read by self.term_handler when a signal arrives
        self.worker_proc = None
        self.rdfdoc_to_do = None

        signal.signal(signal.SIGINT, self.term_handler)
        signal.signal(signal.SIGTERM, self.term_handler)

        rdfdoc_to_do = self._get_dataset(id)
        if rdfdoc_to_do is None:
            log.warning("rdfdoc_to_do is None")
            return 0

        # register this worker
        self.worker_proc = model.WorkerProc()
        self.worker_proc.pid = os.getpid()
        self.worker_proc.rdfdoc = rdfdoc_to_do
        Session.add(self.worker_proc)
        rdfdoc_to_do.worked_on = True
        self.rdfdoc_to_do = rdfdoc_to_do
        log.debug("worker %i working on %i" % (self.worker_proc.pid, self.rdfdoc_to_do.id))

        # documents whose current stats are marked 'broken' are not processed
        # again: release the document, drop the worker registration and exit
        if rdfdoc_to_do.current_stats and rdfdoc_to_do.current_stats.errors == 'broken':
            rdfdoc_to_do.worked_on = False
            rdfdoc_to_do.last_updated = datetime.now()
            Session.delete(self.worker_proc)
            Session.commit()
            sys.exit(0)

        # keep the previous result before current_stats is replaced below
        last_stat_result = rdfdoc_to_do.current_stats
        stat_result = model.StatResult()
        self.stat_result = stat_result
        rdfdoc_to_do.stats.append(stat_result)
        rdfdoc_to_do.current_stats = stat_result
        # start the new result with empty progress fields
        stat_result.triples_done = None
        stat_result.content_length = None
        # NOTE(review): the download callback in this code base writes
        # `bytes_downloaded`; confirm which attribute name the model defines.
        stat_result.bytes_download = None
        stat_result.bytes = None
        stat_result.warnings = None
        stat_result.last_warning = None
        Session.commit()

        log.info(rdfdoc_to_do.format)

        error = None
        modified = True # set True if remote file has been modified
        try:
            rdfdocstats = RDFStats(rdfdoc_to_do.uri.encode('utf-8'), format=rdfdoc_to_do.format, stats=lodstats_stats)
            # progress callbacks persist intermediate values on self.stat_result
            rdfdocstats.set_callback_function_download(self.callback_function_download)
            rdfdocstats.set_callback_function_extraction(self.callback_function_extraction)
            rdfdocstats.set_callback_function_statistics(self.callback_stats)
            rdfdocstats.start_statistics()
        except NotModified, errorstr:
            # remote file unchanged since the last run
            log.warning("not modified")
            modified = False
Ejemplo n.º 6
0
 def command(self):
     """Publish the id of one active RDFDoc to the LODStats dataset queue.

     The dataset id is taken from ``self.args[0]``. The exchange and queue
     are declared and bound before the document is looked up; ``.one()``
     raises when no single active document with that id exists. The
     message is the JSON object ``{"id": <rdfdoc id>}``.
     """
     dataset_id = self.args[0]
     exchange_name = "lodstats_datasets_exchange"
     queue_name = "lodstats_datasets_queue"

     # make sure exchange, queue and binding exist before publishing
     broker = Messaging()
     broker.declareDirectExchange(exchange_name)
     broker.declareQueue(queue_name)
     broker.bindExchangeToQueue(exchange_name, queue_name)

     active_doc_query = Session.query(model.RDFDoc).filter(
         model.RDFDoc.active==True,
         model.RDFDoc.id==dataset_id,
     )
     rdfdoc = active_doc_query.one()

     payload = json.dumps({'id': rdfdoc.id})
     broker.sendMessageToQueue(queue_name, payload)
0
 def getVoid(self):
     """Write the VoID description of every StatResult into ``./void/``.

     For each stat result that has a ``void`` document, the dataset subject
     is replaced by ``http://lodstats.aksw.org/stat_result/<id>.void``:

     * documents using the old ``http://stats.lod2.eu/rdf/void/?source...``
       subject are rewritten textually and stored as ``<id>.ttl``;
     * all other documents are parsed with rdflib, the triples of the
       void dataset subject are copied to the new subject, the old subject
       is removed, and the graph is stored as ``<id>.nt``.

     Errors for a single stat result are printed and do not stop the run.
     Returns None.
     """
     # Join on rdfdoc here (!) replace uri of the dataset with the http://lodstats.aksw.org/stat_result/6702.void
     statResults = Session.query(model.StatResult, model.RDFDoc).\
                           filter(model.StatResult.rdfdoc_id==model.RDFDoc.id).\
                           all()
     for statResult, rdfdoc in statResults:
         if statResult.void is not None:
             try:
                 statResultUri = "http://lodstats.aksw.org/stat_result/"+str(statResult.id)+".void"
                 # NOTE(review): the patterns are greedy up to the last '>'
                 # on a line and leave '.' unescaped; kept as-is on purpose.
                 if re.search("<http://stats.lod2.eu/rdf/void/.source.*>", statResult.void):
                     replacedVoid = re.sub("<http...stats.lod2.eu.rdf.void..source.*>", "<"+statResultUri+">", statResult.void)
                     # `with` closes the file even if the write fails
                     with codecs.open("./void/"+str(statResult.id)+".ttl", 'w', 'utf-8') as f:
                         f.write(replacedVoid)
                 else:
                     g = rdflib.Graph()
                     g.parse(data=statResult.void, format='turtle')
                     # copy every triple of the void dataset to the new subject
                     g.update("""
                             INSERT 
                                {<"""+statResultUri+"""> ?p ?o} 
                             WHERE 
                                {
                                  ?s ?p ?o . 
                                  ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://rdfs.org/ns/void#dataset> 
                                }
                             """)
                     g.commit()
                     # find the original dataset subject (anything but the new URI)
                     result = g.query("""
                             SELECT 
                                DISTINCT ?s ?p ?o 
                             WHERE {
                               ?s ?p ?o . 
                               ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://rdfs.org/ns/void#dataset> . 
                               FILTER(?s != <"""+statResultUri+""">)
                               }
                             """)
                     subjToDelete = None
                     for res in result:
                         subjToDelete = res.s
                         break
                     # only remove when an old subject was actually found
                     if subjToDelete is not None:
                         g.remove((subjToDelete, None, None))
                         g.commit()
                     with codecs.open("./void/"+str(statResult.id)+".nt", 'w', 'utf-8') as f:
                         f.write(g.serialize(format="nt"))
             except Exception as e:
                 # best effort per stat result; interrupts still propagate
                 print(str(e))
Ejemplo n.º 8
0
    def command(self):
        """Synchronise the local RDFDoc table with the CKAN package list.

        Returns 0 without doing anything when two or more ``WorkerProc``
        rows exist. Otherwise logs local packages that are missing from
        ``package_list`` and, for every listed package, fetches the CKAN
        entity and stores the URL/format of its preferred resource on the
        matching (or newly created) ``RDFDoc``.

        Resource preference, highest first: N-Triples, N-Quads, RDF/XML,
        Turtle, N3, then SPARQL endpoints. If a SPARQL resource exists and
        the package also has a ``meta/sitemap`` resource, the sitemap is
        used instead. The session is committed when the document has a
        format and rolled back otherwise.

        NOTE(review): ``package_list``, ``ckan`` and ``config_file`` are not
        defined in this method; presumably module-level names - verify.
        """
        self.logging_file_config(config_file)
        log = logging.getLogger(__name__)

        self.worker_proc = None
        self.rdfdoc_to_do = None

        signal.signal(signal.SIGINT, self.term_handler)
        signal.signal(signal.SIGTERM, self.term_handler)

        # do not spawn more than two workers
        number_of_workers = Session.query(model.WorkerProc).with_lockmode('read').count()
        if number_of_workers >= 2:
            return 0

        # (stored format, accepted CKAN format strings), highest priority first
        direct_formats = (
            ("nt", ("application/x-ntriples", "nt", "gzip:ntriples")),
            ("nq", ("application/x-nquads", "nquads")),
            ("rdf", ("application/rdf+xml", "rdf")),
            ("ttl", ("text/turtle", "rdf/turtle", "ttl")),
            ("n3", ("text/n3", "n3")),
        )

        def normalized_format(resource):
            # a missing/None format is treated as "no match" instead of crashing
            return (resource['format'] or '').lower()

        def pick_resource(resources):
            """Return (format, url) of the preferred resource, or None."""
            for stored_format, accepted in direct_formats:
                for resource in resources:
                    if normalized_format(resource) in accepted:
                        return stored_format, resource['url']
            choice = None
            for resource in resources:
                if normalized_format(resource) in ("api/sparql", "sparql"):
                    # prefer a sitemap.xml over sparql, if any
                    for sitemap_resource in resources:
                        if normalized_format(sitemap_resource) in ("meta/sitemap",):
                            return "sitemap", sitemap_resource['url']
                    # without a sitemap the last SPARQL resource wins
                    choice = ("sparql", resource['url'])
            return choice

        # check for orphaned local packages
        all_local_pkgs = Session.query(model.RDFDoc).all()
        for pkg in all_local_pkgs:
            if pkg.name not in package_list:
                log.debug("%s is gone and will be deleted" % pkg.name)
                # deletion is intentionally disabled; orphans are only logged

        for package_name in package_list:
            try:
                package = ckan.package_entity_get(package_name)
            except Exception as errorstr:
                log.debug("ERROR with %s: %s" % (package_name, errorstr))
                continue

            rdfdoc = Session.query(model.RDFDoc).filter(model.RDFDoc.name==package['name']).first()
            if rdfdoc is None:
                rdfdoc = model.RDFDoc()
                Session.add(rdfdoc)
                rdfdoc.name = package['name']

            selection = pick_resource(package['resources'])
            if selection is not None:
                rdfdoc.format, rdfdoc.uri = selection

            # an existing document keeps its previous format when nothing matched
            if rdfdoc.format is not None:
                Session.commit()
            else:
                Session.rollback()
Ejemplo n.º 9
0
 def term_handler(self, signum, frame):
     """Signal handler: roll back the session and exit with status 0.

     ``signum`` and ``frame`` are the standard signal-handler arguments
     and are not used.
     """
     Session.rollback()
     # same effect as sys.exit(0), which raises SystemExit(0)
     raise SystemExit(0)
Ejemplo n.º 10
0
 def prep_delete(self):
     """Mark every association object of this instance for deletion.

     Walks the association collections in a fixed order and passes each
     member to ``Session.delete``. Nothing is committed here.
     """
     association_attrs = (
         "rdf_class_assocs",
         "vocab_assocs",
         "rdf_property_assocs",
         "defined_class_assocs",
         "rdf_datatype_assocs",
         "language_assocs",
         "link_assocs",
     )
     # each collection is looked up only when its turn comes
     for attr_name in association_attrs:
         for assoc in getattr(self, attr_name):
             Session.delete(assoc)
Ejemplo n.º 11
0
def init_model(engine):
    """Bind the shared ``Session`` to ``engine``.

    Call this before using any of the tables or classes in the model.
    """
    Session.configure(bind=engine)
Ejemplo n.º 12
0
 def term_handler(self, signum, frame):
     """Signal handler: undo pending work, unregister the worker, exit 0.

     The session is rolled back first. If a document is currently marked
     as worked on, it is released and both the running stat result and the
     worker registration are deleted. Otherwise only an existing worker
     registration is deleted. The process always exits with status 0.
     """
     log.debug("exiting through term handler")
     Session.rollback()

     doc = self.rdfdoc_to_do
     nothing_in_progress = doc is None or doc.worked_on == False
     if not nothing_in_progress:
         # release the document and discard the unfinished result
         doc.worked_on = False
         Session.delete(self.stat_result)
         Session.delete(self.worker_proc)
         Session.commit()
     elif self.worker_proc != None:
         Session.delete(self.worker_proc)
         Session.commit()
     sys.exit(0)
Ejemplo n.º 13
0
 def callback_function_extraction(self, rdfdocstat):
     """Store the extracted byte count on the stat result and commit.

     Copies ``rdfdocstat.bytes_extracted`` to ``self.stat_result.bytes``.
     """
     extracted = rdfdocstat.bytes_extracted
     self.stat_result.bytes = extracted
     Session.commit()
Ejemplo n.º 14
0
 def callback_function_download(self, rdfdocstat):
     """Store the download progress on the stat result and commit.

     Copies ``content_length`` and ``bytes_downloaded`` from
     ``rdfdocstat`` to the attributes of the same name on
     ``self.stat_result``.
     """
     total_length = rdfdocstat.content_length
     progress = self.stat_result
     progress.content_length = total_length
     progress.bytes_downloaded = rdfdocstat.bytes_downloaded
     Session.commit()
Ejemplo n.º 15
0
 def _get_dataset(self, id):
     """Return the RDFDoc whose primary key equals ``id``, or None.

     Uses ``.first()``, so a missing row yields None instead of raising.
     """
     matches_id = model.RDFDoc.id==id
     candidates = Session.query(model.RDFDoc).filter(matches_id)
     return candidates.first()
Ejemplo n.º 16
0
        except Exception, errorstr:
            log.error(errorstr)
            error = errorstr

        if error is None and (modified or rdfdoc_to_do.current_stats is None):
            stat_result.triples = rdfdocstats.get_no_of_triples()
            stat_result.void = rdfdocstats.voidify('turtle')
            stat_result.warnings = rdfdocstats.get_no_of_warnings()
            if stat_result.warnings > 0:
		last_warning = rdfdocstats.get_last_warning()
                stat_result.last_warning = unicode(last_warning.message, errors='replace')
            stat_result.has_errors = False
            stat_result.errors = None
            stats_results = rdfdocstats.get_stats_results()
            for class_uri,usage_count in stats_results['usedclasses']['usage_count'].iteritems():
                c = Session.query(model.RDFClass).filter(model.RDFClass.uri==class_uri).first()
                if c is None:
                    c = model.RDFClass()
                    c.uri = class_uri
                    Session.add(c)
                rcs = model.RDFClassStat()
                rcs.rdf_class = c
                rcs.stat_result = stat_result
                rcs.count = usage_count
                Session.add(rcs)
            # vocab:
            for base_uri,result in stats_results['vocabularies'].iteritems():
                if result > 0:
                    v = Session.query(model.Vocab).filter(model.Vocab.uri==base_uri).first()
                    if v is None:
                        v = model.Vocab()