def command(self):
    """Import RDF packages from the pickled CKAN catalog dump into the database.

    Loads the list of catalogs from ``/tmp/ckan_catalogs.pickled``. For every
    package listed under a catalog's ``'rdfpackages'`` key, the first resource
    that is not ``None`` supplies the URL and format. A ``model.RDFDoc`` row is
    added and committed unless a row with the same name already exists.

    Packages without any usable resource are skipped rather than aborting the
    whole import.
    """
    ckanCatalogPath = "/tmp/ckan_catalogs.pickled"
    # NOTE(security): pickle.load() can execute arbitrary code contained in the
    # file, and /tmp is typically writable by every local user. Only run this
    # against a dump you produced yourself.
    # Pickle data is binary, so open in 'rb'; the context manager closes the
    # file even when unpickling fails.
    with open(ckanCatalogPath, 'rb') as f:
        ckanCatalogs = pickle.load(f)

    for catalog in ckanCatalogs:
        prefix = catalog['prefix']  # e.g. datagov
        for package in catalog['rdfpackages']:
            # name is a part of the URI, e.g. http://catalog.data.gov/dataset/name
            rdfPackageName = package['name']

            # just pick up the first resource which is not None
            rdfResource = next(
                (resource for resource in package['resources']
                 if resource is not None),
                None)
            if rdfResource is None:
                # nothing to register for this package
                continue

            already_known = Session.query(model.RDFDoc).filter(
                model.RDFDoc.name == rdfPackageName).first()
            if already_known:
                continue

            newRdfdoc = model.RDFDoc(name=rdfPackageName,
                                     uri=rdfResource['url'],
                                     format=rdfResource['format'],
                                     ckan_catalog=prefix)
            Session.add(newRdfdoc)
            Session.commit()
 def callback_stats(self, rdfdocstat):
     """Persist progress each time the triple count reaches a positive multiple of 10000.

     Copies the current triple count and ``rdfdocstat.warnings`` onto
     ``self.stat_result`` and commits the session; does nothing for any
     other count.
     """
     triple_count = rdfdocstat.get_no_of_triples()
     if not (triple_count > 0 and triple_count % 10000 == 0):
         return

     # update triples done
     progress = self.stat_result
     progress.triples_done = triple_count
     progress.warnings = rdfdocstat.warnings
     Session.commit()
    def process_dataset(self, id):
        """Register this process as the worker for one dataset and start its statistics run.

        The dataset is looked up with ``self._get_dataset(id)``. When that
        yields ``None`` a warning is logged and 0 is returned. Otherwise a
        ``model.WorkerProc`` row carrying the current PID is added, the
        dataset is flagged ``worked_on`` and a fresh ``model.StatResult``
        becomes its ``current_stats`` before ``RDFStats`` is started.

        If the dataset's current stats carry ``errors == 'broken'`` the
        bookkeeping is undone and the process exits via ``sys.exit(0)``.

        NOTE(review): ``error``, ``modified`` and ``last_stat_result`` are
        assigned here but not read in the visible part of this method;
        presumably the result handling that uses them follows elsewhere.
        """
        self.logging_file_config(config_file)
        log = logging.getLogger(__name__)

        # state read by term_handler; reset before the handlers are installed
        self.worker_proc = None
        self.rdfdoc_to_do = None

        signal.signal(signal.SIGINT, self.term_handler)
        signal.signal(signal.SIGTERM, self.term_handler)

        rdfdoc_to_do = self._get_dataset(id)
        if rdfdoc_to_do is None:
            log.warning("rdfdoc_to_do is None")
            return 0

        # register this worker
        self.worker_proc = model.WorkerProc()
        self.worker_proc.pid = os.getpid()
        self.worker_proc.rdfdoc = rdfdoc_to_do
        Session.add(self.worker_proc)
        rdfdoc_to_do.worked_on = True
        self.rdfdoc_to_do = rdfdoc_to_do
        log.debug("worker %i working on %i" % (self.worker_proc.pid, self.rdfdoc_to_do.id))

        # datasets whose current stats are marked 'broken' are not processed
        # again: release them, drop the worker row and terminate the process
        if rdfdoc_to_do.current_stats and rdfdoc_to_do.current_stats.errors == 'broken':
            rdfdoc_to_do.worked_on = False
            rdfdoc_to_do.last_updated = datetime.now()
            Session.delete(self.worker_proc)
            Session.commit()
            sys.exit(0)

        # remember the previous result, then attach a blank StatResult as the
        # dataset's current one; the callbacks update it via self.stat_result
        last_stat_result = rdfdoc_to_do.current_stats
        stat_result = model.StatResult()
        self.stat_result = stat_result
        rdfdoc_to_do.stats.append(stat_result)
        rdfdoc_to_do.current_stats = stat_result
        stat_result.triples_done = None
        stat_result.content_length = None
        # NOTE(review): callback_function_download writes `bytes_downloaded`,
        # not `bytes_download` -- confirm which attribute the model defines
        stat_result.bytes_download = None
        stat_result.bytes = None
        stat_result.warnings = None
        stat_result.last_warning = None
        Session.commit()

        log.info(rdfdoc_to_do.format)

        error = None
        modified = True # set True if remote file has been modified
        try:
            # wire the progress callbacks before starting the run
            rdfdocstats = RDFStats(rdfdoc_to_do.uri.encode('utf-8'), format=rdfdoc_to_do.format, stats=lodstats_stats)
            rdfdocstats.set_callback_function_download(self.callback_function_download)
            rdfdocstats.set_callback_function_extraction(self.callback_function_extraction)
            rdfdocstats.set_callback_function_statistics(self.callback_stats)
            rdfdocstats.start_statistics()
        except NotModified, errorstr:
            # NotModified is only logged and recorded in `modified`
            log.warning("not modified")
            modified = False
 def term_handler(self, signum, frame):
     """Signal handler: roll back, release the worker bookkeeping and exit.

     Installed for SIGINT and SIGTERM. Pending session changes are rolled
     back first. If no dataset is currently being worked on, only the
     worker row (if any) is deleted. Otherwise the dataset is released
     (``worked_on = False``) and both the in-progress stat result and the
     worker row are deleted. The process always ends with ``sys.exit(0)``.

     signum and frame are the standard signal-handler arguments and are
     not used.
     """
     # sibling methods bind ``log`` only as a local, so fetch the logger here
     log = logging.getLogger(__name__)
     log.debug("exiting through term handler")
     Session.rollback()

     rdfdoc = self.rdfdoc_to_do
     # `== False` is kept on purpose: a worked_on value of None must still
     # take the release branch below
     if rdfdoc is None or rdfdoc.worked_on == False:
         if self.worker_proc is not None:
             Session.delete(self.worker_proc)
             Session.commit()
     else:
         rdfdoc.worked_on = False
         # The signal may arrive after the dataset was flagged but before a
         # stat result was created; there is nothing to delete in that case.
         stat_result = getattr(self, 'stat_result', None)
         if stat_result is not None:
             Session.delete(stat_result)
         Session.delete(self.worker_proc)
         Session.commit()
     sys.exit(0)
# Example #5
# 0
 def reset_current_stats_and_worker(self):
     """Delete this object's current stats and worker, and clear its processing state.

     Afterwards ``last_updated`` and ``file_last_modified`` are ``None`` and
     ``worked_on`` is ``False``; the changes are committed.
     """
     if self.current_stats is not None:
         # prep_delete() is committed on its own before the stats row itself
         # is deleted and committed
         self.current_stats.prep_delete()
         Session.commit()
         Session.delete(self.current_stats)
         Session.commit()
     if self.worker is not None:
         # committed together with the field resets below
         Session.delete(self.worker)
     self.last_updated=None
     self.file_last_modified=None
     self.worked_on=False
     Session.commit()
# Example #6
# 0
    def command(self):
        """Synchronise local ``model.RDFDoc`` rows with the CKAN package list.

        Returns 0 without doing anything when two or more worker processes
        are already registered. Otherwise every package in ``package_list``
        is fetched from CKAN, a matching ``RDFDoc`` is looked up by name (or
        created), and its ``format``/``uri`` are taken from the most
        preferred resource the package offers:

        N-Triples, N-Quads, RDF/XML, Turtle, N3, and finally a SPARQL
        endpoint, where a ``meta/sitemap`` resource is preferred over the
        endpoint itself if the package has one.

        The change is committed when the document ends up with a format and
        rolled back otherwise.

        NOTE(review): ``ckan`` and ``package_list`` are not defined inside
        this method; they are assumed to be provided by the enclosing module.
        """
        self.logging_file_config(config_file)
        log = logging.getLogger(__name__)

        self.worker_proc = None
        self.rdfdoc_to_do = None

        signal.signal(signal.SIGINT, self.term_handler)
        signal.signal(signal.SIGTERM, self.term_handler)

        # do not spawn more than two workers
        number_of_workers = Session.query(model.WorkerProc).with_lockmode('read').count()
        if number_of_workers >= 2:
            return 0

        # check for orphaned local packages
        known_names = set(package_list)
        all_local_pkgs = Session.query(model.RDFDoc).all()
        for pkg in all_local_pkgs:
            if pkg.name not in known_names:
                # only reported: the actual deletion is currently disabled
                log.debug("%s is gone and will be deleted" % pkg.name)

        # (accepted CKAN format labels, format stored on the RDFDoc),
        # listed from most to least preferred
        preferred_formats = (
            (("application/x-ntriples", "nt", "gzip:ntriples"), "nt"),
            (("application/x-nquads", "nquads"), "nq"),
            (("application/rdf+xml", "rdf"), "rdf"),
            (("text/turtle", "rdf/turtle", "ttl"), "ttl"),
            (("text/n3", "n3"), "n3"),
        )
        sparql_labels = ("api/sparql", "sparql")
        sitemap_labels = ("meta/sitemap",)

        for package_name in package_list:
            try:
                package = ckan.package_entity_get(package_name)
            except Exception as errorstr:
                log.debug("ERROR with %s: %s" % (package_name, errorstr))
                continue

            rdfdoc = Session.query(model.RDFDoc).filter(model.RDFDoc.name==package['name']).first()
            if rdfdoc is None:
                rdfdoc = model.RDFDoc()
                Session.add(rdfdoc)
                rdfdoc.name = package['name']

            resources = package['resources']

            # first resource of the most preferred serialisation wins
            selected = None
            for labels, doc_format in preferred_formats:
                for resource in resources:
                    if resource['format'].lower() in labels:
                        selected = (doc_format, resource['url'])
                        break
                if selected is not None:
                    break

            if selected is None:
                sparql_resources = [r for r in resources
                                    if r['format'].lower() in sparql_labels]
                if sparql_resources:
                    # prefer a sitemap.xml over sparql, if any
                    sitemap_resources = [r for r in resources
                                         if r['format'].lower() in sitemap_labels]
                    if sitemap_resources:
                        selected = ("sitemap", sitemap_resources[0]['url'])
                    else:
                        # with several endpoints the last one listed is used
                        selected = ("sparql", sparql_resources[-1]['url'])

            if selected is not None:
                rdfdoc.format, rdfdoc.uri = selected

            # a document that already had a format is committed unchanged;
            # a new one without any usable resource is discarded
            if rdfdoc.format is not None:
                Session.commit()
            else:
                Session.rollback()
 def callback_function_extraction(self, rdfdocstat):
     """Store the number of extracted bytes on the current stat result and commit."""
     current = self.stat_result
     current.bytes = rdfdocstat.bytes_extracted
     Session.commit()
 def callback_function_download(self, rdfdocstat):
     """Copy the download progress onto the current stat result and commit.

     NOTE(review): this writes ``bytes_downloaded`` while process_dataset
     resets ``bytes_download`` -- confirm which attribute the model defines.
     """
     current = self.stat_result
     current.content_length = rdfdocstat.content_length
     current.bytes_downloaded = rdfdocstat.bytes_downloaded
     Session.commit()
            for link_uri,result in namespacelinks_ordered.iteritems():
                c = Session.query(model.Link).filter(model.Link.code==link_uri).first()
                if c is None:
                    c = model.Link()
                    c.code = link_uri
                    Session.add(c)
                rcs = model.LinkStat()
                rcs.link = c
                rcs.stat_result = stat_result
                rcs.count = result
                Session.add(rcs)
                nsl_count += 1
                if nsl_count >= 500:
                    break
        elif not modified:
            rdfdoc_to_do.current_stats = last_stat_result
            Session.delete(stat_result)
        else:
            stat_result.triples = None
            stat_result.void = None
            stat_result.has_errors = True
            stat_result.errors = unicode(error)

        rdfdoc_to_do.worked_on = False
        rdfdoc_to_do.last_updated = datetime.now()
        rdfdoc_to_do.file_last_modified = rdfdocstats.last_modified
        stat_result.last_updated = datetime.now()
        Session.delete(self.worker_proc)
        Session.commit()
	log.debug("Done!")