def process_dataset(self, id):
    """Claim the dataset with the given id and start its statistics run.

    Configures logging, installs ``self.term_handler`` for SIGINT and
    SIGTERM, registers this process as a worker for the dataset, attaches a
    fresh ``model.StatResult`` to it and starts an ``RDFStats`` run.

    Args:
        id: Value handed to ``self._get_dataset`` to look the dataset up.

    Returns:
        0 when ``self._get_dataset`` yields no dataset.

    Note:
        When the dataset's current stats carry ``errors == 'broken'`` the
        claim is released again and the process leaves via ``sys.exit(0)``.
    """
    self.logging_file_config(config_file)
    log = logging.getLogger(__name__)

    # Reset *all* per-run state before the handlers go live.  term_handler
    # reads these attributes; a stat_result left over from an earlier call
    # on this instance must not be reachable from it.
    self.worker_proc = None
    self.rdfdoc_to_do = None
    self.stat_result = None
    signal.signal(signal.SIGINT, self.term_handler)
    signal.signal(signal.SIGTERM, self.term_handler)

    rdfdoc_to_do = self._get_dataset(id)
    if rdfdoc_to_do is None:
        log.warning("rdfdoc_to_do is None")
        return 0

    # register this worker
    self.worker_proc = model.WorkerProc()
    self.worker_proc.pid = os.getpid()
    self.worker_proc.rdfdoc = rdfdoc_to_do
    Session.add(self.worker_proc)
    rdfdoc_to_do.worked_on = True
    self.rdfdoc_to_do = rdfdoc_to_do
    log.debug("worker %i working on %i",
              self.worker_proc.pid, self.rdfdoc_to_do.id)

    # Datasets already marked as broken are not retried: release the claim,
    # drop the worker registration and stop.
    if rdfdoc_to_do.current_stats and rdfdoc_to_do.current_stats.errors == 'broken':
        rdfdoc_to_do.worked_on = False
        rdfdoc_to_do.last_updated = datetime.now()
        Session.delete(self.worker_proc)
        Session.commit()
        sys.exit(0)

    # Remember the previous result before current_stats is repointed at the
    # new one (not read again within this part of the function).
    last_stat_result = rdfdoc_to_do.current_stats
    stat_result = model.StatResult()
    self.stat_result = stat_result
    rdfdoc_to_do.stats.append(stat_result)
    rdfdoc_to_do.current_stats = stat_result
    stat_result.triples_done = None
    stat_result.content_length = None
    stat_result.bytes_download = None
    stat_result.bytes = None
    stat_result.warnings = None
    stat_result.last_warning = None
    Session.commit()

    log.info(rdfdoc_to_do.format)
    error = None
    modified = True  # set True if remote file has been modified
    try:
        rdfdocstats = RDFStats(rdfdoc_to_do.uri.encode('utf-8'),
                               format=rdfdoc_to_do.format,
                               stats=lodstats_stats)
        rdfdocstats.set_callback_function_download(self.callback_function_download)
        rdfdocstats.set_callback_function_extraction(self.callback_function_extraction)
        rdfdocstats.set_callback_function_statistics(self.callback_stats)
        rdfdocstats.start_statistics()
    except NotModified as errorstr:
        log.warning("not modified")
        modified = False
def reset_current_stats_and_worker(self):
    """Delete the current stats and worker, then clear the bookkeeping fields.

    The current stats object (if any) is removed in two committed steps:
    first its ``prep_delete()`` clean-up, then the object itself.  The
    worker (if any) is deleted as well.  Finally ``last_updated`` and
    ``file_last_modified`` are set to None, ``worked_on`` to False, and the
    session is committed.
    """
    if self.current_stats is not None:
        # Commit the prep_delete() clean-up on its own before the stats
        # object is deleted -- presumably so dependent rows are gone first.
        self.current_stats.prep_delete()
        Session.commit()
        Session.delete(self.current_stats)
        Session.commit()

    worker = self.worker
    if worker is not None:
        Session.delete(worker)

    # NOTE(review): the original nesting was lost in this file; the resets
    # below are assumed to run whether or not a worker existed -- confirm.
    for field, cleared in (("last_updated", None),
                           ("file_last_modified", None),
                           ("worked_on", False)):
        setattr(self, field, cleared)
    Session.commit()
def term_handler(self, signum, frame):
    """Signal handler: release this worker's claim and exit with status 0.

    Rolls back the session, and if a dataset is currently claimed, clears
    its ``worked_on`` flag and deletes the stat result of the interrupted
    run.  The worker registration is deleted when there is one, the session
    is committed and the process exits.

    Args:
        signum: Number of the received signal (not used).
        frame: Stack frame that was interrupted (not used).
    """
    log.debug("exiting through term handler")
    # Throw away whatever the interrupted code left pending in the session.
    Session.rollback()

    rdfdoc = self.rdfdoc_to_do
    # Deliberately ``!= False`` instead of a truthiness test: a flag that is
    # None keeps taking the full clean-up path, exactly as before.
    if rdfdoc is not None and rdfdoc.worked_on != False:  # noqa: E712
        rdfdoc.worked_on = False
        # The signal can arrive before a stat result was attached to this
        # run; deleting a missing/None result would raise inside the handler
        # and the process would never reach the clean exit below.
        stat_result = getattr(self, 'stat_result', None)
        if stat_result is not None:
            Session.delete(stat_result)

    if self.worker_proc is not None:
        Session.delete(self.worker_proc)
    Session.commit()
    sys.exit(0)
def prep_delete(self):
    """Mark every association object hanging off this instance for deletion.

    Walks the association collections one after another, in a fixed order,
    and passes each contained object to ``Session.delete``.  Nothing is
    committed here; that is left to the caller.
    """
    assoc_attrs = (
        "rdf_class_assocs",
        "vocab_assocs",
        "rdf_property_assocs",
        "defined_class_assocs",
        "rdf_datatype_assocs",
        "language_assocs",
        "link_assocs",
    )
    for attr_name in assoc_attrs:
        # Look the collection up only when its turn comes, so attribute
        # access happens in the same sequence as the deletes.
        for assoc in getattr(self, attr_name):
            Session.delete(assoc)
for link_uri,result in namespacelinks_ordered.iteritems(): c = Session.query(model.Link).filter(model.Link.code==link_uri).first() if c is None: c = model.Link() c.code = link_uri Session.add(c) rcs = model.LinkStat() rcs.link = c rcs.stat_result = stat_result rcs.count = result Session.add(rcs) nsl_count += 1 if nsl_count >= 500: break elif not modified: rdfdoc_to_do.current_stats = last_stat_result Session.delete(stat_result) else: stat_result.triples = None stat_result.void = None stat_result.has_errors = True stat_result.errors = unicode(error) rdfdoc_to_do.worked_on = False rdfdoc_to_do.last_updated = datetime.now() rdfdoc_to_do.file_last_modified = rdfdocstats.last_modified stat_result.last_updated = datetime.now() Session.delete(self.worker_proc) Session.commit() log.debug("Done!")