def generate_release(self):
    '''
    Get the list of data sets to be processed and try to harmonise
    them into one big data cube
    '''
    # Prepare a task list
    tasks = []
    for sheet_name in self._get_sheets_list():
        output_file = self._conf.get_path('release') + sheet_name + '.ttl'
        task = {
            'sheet_name': sheet_name,
            'output_file': output_file,
            'endpoint': self._conf.get_SPARQL(),
            'compress': self._conf.isCompress(),
            'target': self._conf.get_namespace('data'),
            'release_graph': self._conf.get_graph_name('release'),
            'raw_data_graph': self._conf.get_graph_name('raw-data'),
            'rules_graph': self._conf.get_graph_name('rules'),
            'measure': self._conf.get_measure()
        }
        tasks.append(task)

    # Call CubeMaker in parallel, capping the pool size to avoid
    # hammering the store too much
    cpu_count = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=min(4, cpu_count))
    pool.map(generate_release_thread, tasks)
    pool.close()
    pool.join()

    # Push all the data to the triple store
    self._push_to_graph(self._conf.get_graph_name('release'),
                        self._conf.get_path('release'))

    # Create an instance of CubeMaker
    cubeMaker = CubeMaker(self._conf.get_SPARQL(),
                          self._conf.get_graph_name('release'),
                          self._conf.get_graph_name('raw-data'),
                          self._conf.get_graph_name('rules'))
    cubeMaker.set_target_namespace(self._conf.get_namespace('data'))
    cubeMaker.set_compress(self._conf.isCompress())

    # Generate the DSD
    dsd_file_name = self._conf.get_path('release') + 'dsd.ttl'
    log.info("Asking CubeMaker to generate the DSD")
    cubeMaker.generate_dsd(self._conf.get_cube_title(),
                           self._conf.get_measure(),
                           self._conf.get_measureunit(),
                           self._conf.get_slices(),
                           dsd_file_name)

    # Load the DSD into the release graph
    pusher = Pusher(self._conf.get_SPARUL(),
                    self._conf.get_user(),
                    self._conf.get_secret())
    log.info("[{}] Adding the content of the DSD".format(
        self._conf.get_graph_name('release')))
    if self._conf.isCompress():
        dsd_file_name = dsd_file_name + ".bz2"
    pusher.upload_file(self._conf.get_graph_name('release'), dsd_file_name)
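# Note on the design: multiprocessing.Pool pickles each task dict and
# hands it to a separate worker process (despite the "thread" in its
# name), which is why generate_release_thread below is a plain
# module-level function taking a single picklable argument rather than
# a method of this class.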
def generate_release_thread(parameters):
    '''
    Worker for generate_release, executed in a separate process
    by multiprocessing.Pool
    '''
    sheet_name = parameters['sheet_name']
    output_file = parameters['output_file']
    log.info("[{}] Calling CubeMaker".format(sheet_name))
    try:
        cubeMaker = CubeMaker(parameters['endpoint'],
                              parameters['release_graph'],
                              parameters['raw_data_graph'],
                              parameters['rules_graph'])
        cubeMaker.set_target_namespace(parameters['target'])
        cubeMaker.set_compress(parameters['compress'])
        cubeMaker.process(parameters['measure'], sheet_name, output_file)
    except Exception as e:
        log.error("[{}] Error in CubeMaker: {}".format(sheet_name, e))
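# A minimal usage sketch. The enclosing class is not shown in this
# excerpt, so 'Harmoniser' and 'Configuration' are hypothetical
# stand-ins; multiprocessing, CubeMaker, Pusher and log are assumed to
# be imported/configured at the top of the module:
#
#   conf = Configuration('config.ini')
#   harmoniser = Harmoniser(conf)
#   harmoniser.generate_release()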