def __call__(self, chunk: TransformationChunk):
    """Prune the hierarchy of every entity listed in the chunk's tasks.

    Stores ``chunk.tasks`` and ``chunk.context`` on the instance. For each
    task the entity is read from ``task.in_path``, handed to
    ``self._prune_hierarchy`` (whose return value is ignored, so the entity
    is presumably modified in place) and then written to ``task.out_path``.

    Cumulative I/O and processing times are logged every
    ``max(1, len(tasks) // 20)`` tasks and once more after the loop.
    """
    self._tasks = chunk.tasks
    self._context = chunk.context

    template = " {:>5} / {} io: {:0.0f}s working: {:0.0f}s"
    io_seconds = 0
    work_seconds = 0

    def report(done):
        # The running totals are looked up when the report is emitted.
        logger.info(
            template.format(done, len(self._tasks), io_seconds, work_seconds))

    interval = max(1, len(self._tasks) // 20)
    for position, task in enumerate(self._tasks):
        if not position % interval:
            report(position)

        # Read phase (counted as I/O).
        started = time.time()
        entity = read_json(task.in_path)
        io_seconds = io_seconds + (time.time() - started)

        # Processing phase.
        started = time.time()
        self._prune_hierarchy(entity)
        work_seconds = work_seconds + (time.time() - started)

        # Write phase (counted as I/O).
        started = time.time()
        write_json(task.out_path, entity)
        io_seconds = io_seconds + (time.time() - started)

    report(len(self._tasks))
def _save_hierarchy(self, hierarchy):
    """Add ``hierarchy`` to every task's entity and write the result.

    For each task in ``self._tasks`` the entity is read from
    ``task.in_path``, passed together with ``hierarchy`` to
    ``self._add_hierarchy`` (whose return value is ignored, so the entity is
    presumably updated in place) and written to ``task.out_path``.

    Cumulative I/O and processing times are logged every ``log_step`` tasks
    (about 20 progress lines in total) and once more after the loop.

    Args:
        hierarchy: Value forwarded unchanged to ``self._add_hierarchy``.
    """
    logger.info("Saving hierarchy ...")
    progress_format = " {:>5} / {} io: {:0.0f}s working: {:0.0f}s"
    io_time = 0
    working_time = 0
    task_count = len(self._tasks)
    # At least 1 so that the modulo below is always well defined.
    log_step = max(1, task_count // 20)
    for index, task in enumerate(self._tasks):
        # Report every ``log_step`` tasks, the same cadence as the other
        # task loops in this file.
        if index % log_step == 0:
            logger.info(progress_format.format(
                index, task_count, io_time, working_time))

        io_start = time.time()
        entity = read_json(task.in_path)
        io_time += time.time() - io_start

        add_start = time.time()
        self._add_hierarchy(entity, hierarchy)
        working_time += time.time() - add_start

        io_start = time.time()
        write_json(task.out_path, entity)
        io_time += time.time() - io_start

    logger.info(progress_format.format(
        task_count, task_count, io_time, working_time))
def _write_terms_mapping(self, terms_to_entities: WikidataEntityMap):
    """Collect term mappings from every task's entity into ``terms_to_entities``.

    For each task in ``self._tasks`` the entity is read from
    ``task.in_path``; every selector yielded by
    ``self._context.mapping_selector(entity)`` is passed, together with
    ``terms_to_entities``, to ``self._add_mappings_for_selector``. The entity
    is then written to ``task.out_path``.

    Cumulative I/O and processing times are logged every
    ``max(1, len(tasks) // 20)`` tasks and once more after the loop.

    Args:
        terms_to_entities: Map handed to ``self._add_mappings_for_selector``
            for each selector.
    """
    line = " {:>5} / {} io: {:0.0f}s working: {:0.0f}s"
    every = max(1, len(self._tasks) // 20)
    spent_io = 0
    spent_working = 0

    for number, task in enumerate(self._tasks):
        if number % every == 0:
            message = line.format(
                number, len(self._tasks), spent_io, spent_working)
            logger.info(message)

        tick = time.time()
        entity = read_json(task.in_path)
        spent_io = spent_io + (time.time() - tick)

        # Selector creation is timed as part of the processing phase.
        tick = time.time()
        selectors = self._context.mapping_selector(entity)
        for selector in selectors:
            self._add_mappings_for_selector(selector, terms_to_entities)
        spent_working = spent_working + (time.time() - tick)

        tick = time.time()
        write_json(task.out_path, entity)
        spent_io = spent_io + (time.time() - tick)

    message = line.format(
        len(self._tasks), len(self._tasks), spent_io, spent_working)
    logger.info(message)
def __call__(self, input_dir: str, output_dir: str):
    """Combine each input record with previously stored content and save it.

    Opens a ``TransformationContext`` for ``output_dir``. For every record
    produced by ``self._iterate_input_file()`` the target path is obtained
    from ``context.get_or_create(record["iri"])``. If a file already exists
    at that path its JSON content is loaded, otherwise an empty dict is
    used. The result of ``self._data_selector(old, new)`` is then written
    back to the same path.

    Args:
        input_dir: Not referenced in this method body.
        output_dir: Passed to ``TransformationContext``.
    """
    with TransformationContext(output_dir) as context:
        for incoming in self._iterate_input_file():
            target = context.get_or_create(incoming["iri"])
            # Start from an empty dict when nothing has been stored yet.
            existing = read_json(target) if os.path.exists(target) else {}
            selected = self._data_selector(existing, incoming)
            write_json(target, selected)