Example #1
    def generate_release(self):
        '''
        Get the list of data sets to be processed and try to harmonise them
        into one big data cube
        '''
        # Prepare a task list
        tasks = []
        for sheet_name in self._get_sheets_list():
            output_file = self._conf.get_path('release') + sheet_name + '.ttl'
            task = {
                'sheet_name': sheet_name,
                'output_file': output_file,
                'endpoint': self._conf.get_SPARQL(),
                'compress': self._conf.isCompress(),
                'target': self._conf.get_namespace('data'),
                'release_graph': self._conf.get_graph_name('release'),
                'raw_data_graph': self._conf.get_graph_name('raw-data'),
                'rules_graph': self._conf.get_graph_name('rules'),
                'measure': self._conf.get_measure()
            }
            tasks.append(task)

        # Generate the cubes in parallel, capping the pool size so we
        # do not hammer the store too much
        cpu_count = multiprocessing.cpu_count()
        pool = multiprocessing.Pool(processes=min(4, cpu_count))
        pool.map(generate_release_thread, tasks)
        pool.close()
        pool.join()

        # Push all the data to the triple store
        self._push_to_graph(self._conf.get_graph_name('release'),
                            self._conf.get_path('release'))

        # Create an instance of CubeMaker
        cubeMaker = CubeMaker(self._conf.get_SPARQL(),
                              self._conf.get_graph_name('release'),
                              self._conf.get_graph_name('raw-data'),
                              self._conf.get_graph_name('rules'))
        cubeMaker.set_target_namespace(self._conf.get_namespace('data'))
        cubeMaker.set_compress(self._conf.isCompress())

        # Update the DSD
        dsd_file_name = self._conf.get_path('release') + 'dsd.ttl'
        log.info("Asking CubeMaker to generate the DSD")
        cubeMaker.generate_dsd(self._conf.get_cube_title(),
                               self._conf.get_measure(),
                               self._conf.get_measureunit(),
                               self._conf.get_slices(), dsd_file_name)

        # Load the DSD
        pusher = Pusher(self._conf.get_SPARUL(), self._conf.get_user(),
                        self._conf.get_secret())
        log.info("[{}] Adding the content of the DSD".format(
            self._conf.get_graph_name('release')))
        if self._conf.isCompress():
            dsd_file_name = dsd_file_name + ".bz2"
        pusher.upload_file(self._conf.get_graph_name('release'), dsd_file_name)
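
The worker function generate_release_thread that the pool maps over is referenced but not shown in this listing. A minimal sketch of what such a worker could look like, reusing only the CubeMaker calls visible above; the per-sheet method process_sheet is a hypothetical stand-in for whatever the project's real entry point is:

def generate_release_thread(task):
    '''
    Process one task dict built in generate_release, in its own process.
    Sketch only: process_sheet is a hypothetical method name, not
    confirmed by the source.
    '''
    cube_maker = CubeMaker(task['endpoint'],
                           task['release_graph'],
                           task['raw_data_graph'],
                           task['rules_graph'])
    cube_maker.set_target_namespace(task['target'])
    cube_maker.set_compress(task['compress'])
    # Hypothetical per-sheet call: harmonise the raw data of one sheet
    # and write the resulting triples to the task's output file
    cube_maker.process_sheet(task['sheet_name'], task['measure'],
                             task['output_file'])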
Example #2
 def _push_to_graph(self, named_graph, directory):
     '''
     Push data to the triple store
     '''
     pusher = Pusher(self._conf.get_SPARUL(),
                     self._conf.get_user(),
                     self._conf.get_secret())
     log.info("[{}] Cleaning the content of the graph".format(named_graph))
     pusher.clean_graph(named_graph)
     log.info("[{}] Loading files in {}".format(named_graph, directory))
     for input_file in sorted(glob.glob(directory + '/*')):
         log.info("[{}] Loading {}".format(named_graph, input_file))
         pusher.upload_file(named_graph, input_file)
     log.info("[{}] Done loading data".format(named_graph))