Beispiel #1
0
    def command(self):
        """Register RDF documents listed in the pickled CKAN catalog dump.

        Loads a list of catalogs from ``/tmp/ckan_catalogs.pickled``. For every
        package under a catalog's ``'rdfpackages'`` key, the first resource
        that is not ``None`` supplies the URL and format of a new
        ``model.RDFDoc``; the catalog's ``'prefix'`` is stored as
        ``ckan_catalog``. Packages whose name already exists as an ``RDFDoc``
        are left untouched, and packages without any usable resource are
        skipped. Every new document is committed individually.
        """
        ckanCatalogPath = "/tmp/ckan_catalogs.pickled"
        # NOTE(security): unpickling can execute arbitrary code, so this file
        # must only ever be produced by a trusted process.
        # Binary mode keeps the pickle byte stream free of newline translation,
        # and the context manager closes the file even if loading fails.
        with open(ckanCatalogPath, 'rb') as f:
            ckanCatalogs = pickle.load(f)

        for catalog in ckanCatalogs:
            # e.g. "datagov"; catalog['ckanApiUrl'] is not needed here
            prefix = catalog['prefix']
            for package in catalog['rdfpackages']:
                # name is a part of the URI http://catalog.data.gov/dataset/name
                rdfPackageName = package['name']

                # just pick up the first resource which is not None
                rdfResource = next(
                    (resource for resource in package['resources']
                     if resource is not None),
                    None)
                if rdfResource is None:
                    # nothing to point the document at
                    continue

                already_known = Session.query(model.RDFDoc).filter(
                    model.RDFDoc.name == rdfPackageName).first()
                if already_known:
                    continue

                newRdfdoc = model.RDFDoc(name=rdfPackageName,
                                         uri=rdfResource['url'],
                                         format=rdfResource['format'],
                                         ckan_catalog=prefix)
                Session.add(newRdfdoc)
                Session.commit()
Beispiel #2
0
    def process_dataset(self, id):
        """Start the statistics run for the dataset identified by *id*.

        Configures logging, installs ``self.term_handler`` for SIGINT and
        SIGTERM, registers a ``model.WorkerProc`` row for this process,
        attaches a fresh ``model.StatResult`` to the document and then runs
        ``RDFStats`` on the document's URI.

        Returns 0 when ``self._get_dataset(id)`` yields ``None``. If the
        document's current stats carry ``errors == 'broken'`` the worker is
        unregistered and the process terminates through ``sys.exit(0)``.
        """
        self.logging_file_config(config_file)
        log = logging.getLogger(__name__)

        self.worker_proc = None
        self.rdfdoc_to_do = None

        signal.signal(signal.SIGINT, self.term_handler)
        signal.signal(signal.SIGTERM, self.term_handler)

        rdfdoc_to_do = self._get_dataset(id)
        if rdfdoc_to_do is None:
            log.warning("rdfdoc_to_do is None")
            return 0

        # register this worker
        self.worker_proc = model.WorkerProc()
        self.worker_proc.pid = os.getpid()
        self.worker_proc.rdfdoc = rdfdoc_to_do
        Session.add(self.worker_proc)
        rdfdoc_to_do.worked_on = True
        self.rdfdoc_to_do = rdfdoc_to_do
        log.debug("worker %i working on %i" % (self.worker_proc.pid, self.rdfdoc_to_do.id))

        # documents already marked as broken are not processed again:
        # release the document, unregister the worker and stop the process
        if rdfdoc_to_do.current_stats and rdfdoc_to_do.current_stats.errors == 'broken':
            rdfdoc_to_do.worked_on = False
            rdfdoc_to_do.last_updated = datetime.now()
            Session.delete(self.worker_proc)
            Session.commit()
            sys.exit(0)

        # remember the previous result before current_stats is replaced
        # (not read again within the part of the method visible here)
        last_stat_result = rdfdoc_to_do.current_stats
        stat_result = model.StatResult()
        self.stat_result = stat_result
        rdfdoc_to_do.stats.append(stat_result)
        rdfdoc_to_do.current_stats = stat_result
        # start the new result with all progress fields cleared
        stat_result.triples_done = None
        stat_result.content_length = None
        stat_result.bytes_download = None
        stat_result.bytes = None
        stat_result.warnings = None
        stat_result.last_warning = None
        Session.commit()

        log.info(rdfdoc_to_do.format)

        error = None
        modified = True # set True if remote file has been modified
        try:
            rdfdocstats = RDFStats(rdfdoc_to_do.uri.encode('utf-8'), format=rdfdoc_to_do.format, stats=lodstats_stats)
            rdfdocstats.set_callback_function_download(self.callback_function_download)
            rdfdocstats.set_callback_function_extraction(self.callback_function_extraction)
            rdfdocstats.set_callback_function_statistics(self.callback_stats)
            rdfdocstats.start_statistics()
        except NotModified:
            # the exception instance was never used, so it is no longer bound;
            # this form is valid on both Python 2.6+ and Python 3
            log.warning("not modified")
            modified = False
Beispiel #3
0
    def command(self):
        """Synchronise local ``RDFDoc`` rows with the packages known to CKAN.

        Returns 0 without doing anything when two or more ``WorkerProc`` rows
        exist. Otherwise it logs every local document whose name is missing
        from ``package_list`` and then, for each package name, fetches the
        package through ``ckan.package_entity_get``, creates the ``RDFDoc`` if
        needed and picks the resource to analyse by format priority:
        N-Triples, N-Quads, RDF/XML, Turtle, N3 and finally SPARQL (where a
        ``meta/sitemap`` resource is preferred over the endpoint itself).

        The session is committed when the document ends up with a format and
        rolled back otherwise. ``package_list`` and ``ckan`` are not defined
        in this method and have to be supplied by the enclosing scope.
        """
        self.logging_file_config(config_file)
        log = logging.getLogger(__name__)

        self.worker_proc = None
        self.rdfdoc_to_do = None

        signal.signal(signal.SIGINT, self.term_handler)
        signal.signal(signal.SIGTERM, self.term_handler)

        # do not spawn more than two workers
        number_of_workers = Session.query(model.WorkerProc).with_lockmode('read').count()
        if number_of_workers >= 2:
            return 0

        # (stored format label, accepted CKAN format strings), best first
        format_priority = (
            ("nt", ("application/x-ntriples", "nt", "gzip:ntriples")),
            ("nq", ("application/x-nquads", "nquads")),
            ("rdf", ("application/rdf+xml", "rdf")),
            ("ttl", ("text/turtle", "rdf/turtle", "ttl")),
            ("n3", ("text/n3", "n3")),
        )
        sparql_formats = ("api/sparql", "sparql")
        sitemap_formats = ("meta/sitemap",)

        # check for orphaned local packages; they are only reported, no
        # delete is issued here
        all_local_pkgs = Session.query(model.RDFDoc).all()
        for pkg in all_local_pkgs:
            if pkg.name not in package_list:
                log.debug("%s is gone and will be deleted" % pkg.name)

        for package_name in package_list:
            try:
                package = ckan.package_entity_get(package_name)
            except Exception as errorstr:
                # best effort: a package that cannot be fetched is skipped
                log.debug("ERROR with %s: %s" % (package_name, errorstr))
                continue

            rdfdoc = Session.query(model.RDFDoc).filter(model.RDFDoc.name==package['name']).first()
            if rdfdoc is None:
                rdfdoc = model.RDFDoc()
                Session.add(rdfdoc)
                rdfdoc.name = package['name']

            # pair every resource with its lower-cased format once; a format
            # of None is treated as "no match" instead of raising
            candidates = [((resource['format'] or '').lower(), resource)
                          for resource in package['resources']]

            chosen = None  # (format label, resource dict)
            for label, accepted in format_priority:
                matching = [res for fmt, res in candidates if fmt in accepted]
                if matching:
                    chosen = (label, matching[0])
                    break

            if chosen is None:
                endpoints = [res for fmt, res in candidates if fmt in sparql_formats]
                if endpoints:
                    # prefer a sitemap.xml over sparql, if any
                    sitemaps = [res for fmt, res in candidates if fmt in sitemap_formats]
                    if sitemaps:
                        chosen = ("sitemap", sitemaps[0])
                    else:
                        # NOTE: unlike the other formats, the last SPARQL
                        # resource wins; kept as in the previous implementation
                        chosen = ("sparql", endpoints[-1])

            if chosen is not None:
                rdfdoc.format = chosen[0]
                rdfdoc.uri = chosen[1]['url']

            if rdfdoc.format is not None:
                Session.commit()
            else:
                # no usable resource: discard the pending (possibly new) document
                Session.rollback()
Beispiel #4
0
        if error is None and (modified or rdfdoc_to_do.current_stats is None):
            stat_result.triples = rdfdocstats.get_no_of_triples()
            stat_result.void = rdfdocstats.voidify('turtle')
            stat_result.warnings = rdfdocstats.get_no_of_warnings()
            if stat_result.warnings > 0:
		last_warning = rdfdocstats.get_last_warning()
                stat_result.last_warning = unicode(last_warning.message, errors='replace')
            stat_result.has_errors = False
            stat_result.errors = None
            stats_results = rdfdocstats.get_stats_results()
            for class_uri,usage_count in stats_results['usedclasses']['usage_count'].iteritems():
                c = Session.query(model.RDFClass).filter(model.RDFClass.uri==class_uri).first()
                if c is None:
                    c = model.RDFClass()
                    c.uri = class_uri
                    Session.add(c)
                rcs = model.RDFClassStat()
                rcs.rdf_class = c
                rcs.stat_result = stat_result
                rcs.count = usage_count
                Session.add(rcs)
            # vocab:
            for base_uri,result in stats_results['vocabularies'].iteritems():
                if result > 0:
                    v = Session.query(model.Vocab).filter(model.Vocab.uri==base_uri).first()
                    if v is None:
                        v = model.Vocab()
                        v.uri = base_uri
                        Session.add(v)
                    rvs = model.RDFVocabStat()
                    rvs.vocab = v