def process_dataset(self, id):
        """Start a statistics run for the dataset identified by ``id``.

        Only the first part of this method is visible in this excerpt; whatever
        follows the ``NotModified`` handler is not shown here.

        Visible steps, in order:

        1. Configure logging and install ``self.term_handler`` for SIGINT and
           SIGTERM.
        2. Fetch the dataset through ``self._get_dataset(id)``.
        3. Register a ``model.WorkerProc`` row carrying this process' pid and
           flag the dataset as ``worked_on``.
        4. If the dataset's current stats carry ``errors == 'broken'``, undo
           the registration, commit and terminate the process.
        5. Otherwise attach a fresh ``model.StatResult`` as the dataset's
           current stats, commit, and run ``RDFStats`` with the download,
           extraction and statistics callbacks.

        :param id: value passed straight to ``self._get_dataset``.
        :returns: ``0`` when ``_get_dataset`` returns ``None``; other return
            paths are not visible in this excerpt.
        :raises SystemExit: via ``sys.exit(0)`` when the current stats are
            marked ``'broken'``.
        """
        # NOTE(review): config_file is not defined in this method; presumably a
        # module-level name -- confirm.
        self.logging_file_config(config_file)
        log = logging.getLogger(__name__)

        # Initialised to None before the signal handlers are installed;
        # term_handler tests both attributes against None.
        self.worker_proc = None
        self.rdfdoc_to_do = None

        signal.signal(signal.SIGINT, self.term_handler)
        signal.signal(signal.SIGTERM, self.term_handler)

        rdfdoc_to_do = self._get_dataset(id)
        if rdfdoc_to_do is None:
            log.warning("rdfdoc_to_do is None")
            return 0

        # register this worker
        self.worker_proc = model.WorkerProc()
        self.worker_proc.pid = os.getpid()
        self.worker_proc.rdfdoc = rdfdoc_to_do
        Session.add(self.worker_proc)
        rdfdoc_to_do.worked_on = True
        self.rdfdoc_to_do = rdfdoc_to_do
        log.debug("worker %i working on %i" % (self.worker_proc.pid, self.rdfdoc_to_do.id))

        # Datasets whose current stats are marked 'broken' are not processed:
        # release the dataset, drop the worker row, commit and exit.
        if rdfdoc_to_do.current_stats and rdfdoc_to_do.current_stats.errors == 'broken':
            rdfdoc_to_do.worked_on = False
            rdfdoc_to_do.last_updated = datetime.now()
            Session.delete(self.worker_proc)
            Session.commit()
            sys.exit(0)

        # Keep a reference to the previous stats before current_stats is
        # repointed at the new StatResult below.
        last_stat_result = rdfdoc_to_do.current_stats
        stat_result = model.StatResult()
        # Stored on self so term_handler can delete it.
        self.stat_result = stat_result
        rdfdoc_to_do.stats.append(stat_result)
        rdfdoc_to_do.current_stats = stat_result
        # Progress/size/warning fields start out as None.
        stat_result.triples_done = None
        stat_result.content_length = None
        stat_result.bytes_download = None
        stat_result.bytes = None
        stat_result.warnings = None
        stat_result.last_warning = None
        Session.commit()

        log.info(rdfdoc_to_do.format)

        # error is not reassigned anywhere in the visible part of this method.
        error = None
        modified = True # set True if remote file has been modified
        try:
            # NOTE(review): lodstats_stats is not defined in this method;
            # presumably a module-level name -- confirm.
            rdfdocstats = RDFStats(rdfdoc_to_do.uri.encode('utf-8'), format=rdfdoc_to_do.format, stats=lodstats_stats)
            rdfdocstats.set_callback_function_download(self.callback_function_download)
            rdfdocstats.set_callback_function_extraction(self.callback_function_extraction)
            rdfdocstats.set_callback_function_statistics(self.callback_stats)
            rdfdocstats.start_statistics()
        # Python 2-only "except Class, name" syntax; errorstr is not used in
        # the handler body.
        except NotModified, errorstr:
            log.warning("not modified")
            modified = False
Exemple #2
0
 def reset_current_stats_and_worker(self):
     """Delete the current stats and worker rows and clear the progress fields.

     When ``self.current_stats`` is set, its ``prep_delete()`` is called and
     committed first; the stats row itself is deleted in a second commit.
     When ``self.worker`` is set, it is deleted. Finally ``last_updated`` and
     ``file_last_modified`` are set to ``None``, ``worked_on`` to ``False``,
     and the session is committed.
     """
     stats = self.current_stats
     if stats is not None:
         # Two separate commits: one after prep_delete(), one after the
         # delete of the stats row.
         stats.prep_delete()
         Session.commit()
         Session.delete(stats)
         Session.commit()

     worker = self.worker
     if worker is not None:
         Session.delete(worker)

     self.last_updated = None
     self.file_last_modified = None
     self.worked_on = False
     Session.commit()
 def term_handler(self, signum, frame):
     """Signal handler: roll back, clean up this worker's rows, exit with 0.

     After rolling back the session:

     * if no dataset is attached (``self.rdfdoc_to_do`` is ``None``) or the
       dataset is not flagged ``worked_on``, only the worker row (if any) is
       deleted and committed;
     * otherwise the dataset's ``worked_on`` flag is cleared and the
       in-progress stat result and the worker row are deleted and committed.

     ``self.stat_result`` is read with ``getattr`` and skipped when absent or
     ``None``: this method does not set it, so it may not exist yet when a
     signal arrives, and an ``AttributeError`` here would abort the cleanup
     before the commit.

     :param signum: signal number (unused).
     :param frame: current stack frame (unused).
     :raises SystemExit: always, with exit status 0.
     """
     # NOTE(review): `log` is not defined in this method; presumably a
     # module-level logger -- confirm.
     log.debug("exiting through term handler")
     Session.rollback()
     # `== False` is kept on purpose: a worked_on value of None must still
     # take the else branch, exactly as before.
     if self.rdfdoc_to_do is None or self.rdfdoc_to_do.worked_on == False:
         if self.worker_proc is not None:
             Session.delete(self.worker_proc)
             Session.commit()
     else:
         self.rdfdoc_to_do.worked_on = False
         stat_result = getattr(self, 'stat_result', None)
         if stat_result is not None:
             Session.delete(stat_result)
         Session.delete(self.worker_proc)
         Session.commit()
     sys.exit(0)
Exemple #4
0
 def prep_delete(self):
     """Mark every association object hanging off this instance for deletion.

     Walks the association collections in a fixed order and passes each
     contained object to ``Session.delete``. Nothing is committed here.
     """
     assoc_attrs = (
         'rdf_class_assocs',
         'vocab_assocs',
         'rdf_property_assocs',
         'defined_class_assocs',
         'rdf_datatype_assocs',
         'language_assocs',
         'link_assocs',
     )
     # Each collection is looked up right before it is iterated, so attribute
     # access and deletes interleave in the same order as listed above.
     for attr_name in assoc_attrs:
         for assoc in getattr(self, attr_name):
             Session.delete(assoc)
            for link_uri,result in namespacelinks_ordered.iteritems():
                c = Session.query(model.Link).filter(model.Link.code==link_uri).first()
                if c is None:
                    c = model.Link()
                    c.code = link_uri
                    Session.add(c)
                rcs = model.LinkStat()
                rcs.link = c
                rcs.stat_result = stat_result
                rcs.count = result
                Session.add(rcs)
                nsl_count += 1
                if nsl_count >= 500:
                    break
        elif not modified:
            rdfdoc_to_do.current_stats = last_stat_result
            Session.delete(stat_result)
        else:
            stat_result.triples = None
            stat_result.void = None
            stat_result.has_errors = True
            stat_result.errors = unicode(error)

        rdfdoc_to_do.worked_on = False
        rdfdoc_to_do.last_updated = datetime.now()
        rdfdoc_to_do.file_last_modified = rdfdocstats.last_modified
        stat_result.last_updated = datetime.now()
        Session.delete(self.worker_proc)
        Session.commit()
	log.debug("Done!")