Example 1
    def _get_sheets_list(self):
        '''
        Get a list of the sheets loaded in the triple store
        '''
        sparql = SPARQLWrap(self._conf.get_SPARQL())
        params = {'__RAW_DATA__': self._conf.get_graph_name('raw-data')}
        results = sparql.run_select(SHEETS_QUERY, params)
        datasets = [sparql.format(r['sheet']) for r in results]
        return datasets
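
The SHEETS_QUERY constant and the SPARQLWrap helper are defined elsewhere in the project; only their use is visible here. Below is a minimal, hypothetical sketch of what such a parameterised query and its placeholder handling might look like, assuming run_select simply substitutes each __KEY__ placeholder with its value before sending the query; the query text and the Sheet class URI are illustrative, not taken from the project.

# Hypothetical stand-in for the project's SHEETS_QUERY constant
SHEETS_QUERY = """
SELECT DISTINCT ?sheet
FROM <__RAW_DATA__>
WHERE {
    ?sheet a <http://example.org/tablinker/Sheet> .
}
"""

def substitute_params(query, params):
    # Assumed placeholder handling: plain textual replacement of each
    # placeholder key by its value
    for key, value in params.items():
        query = query.replace(key, value)
    return query

print(substitute_params(SHEETS_QUERY,
                        {'__RAW_DATA__': 'http://example.org/graph/raw-data'}))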
Example 2
    def go(self, output_file_name):
        '''
        Compute all the statistics
        '''
        # Run all the queries
        results = {}
        if self.use_cache and os.path.isfile('/tmp/results.json'):
            log.info("Load cached data")
            with open('/tmp/results.json', 'r') as infile:
                results = json.load(infile)
        else:
            sparql = SPARQLWrap(self.end_point)
            for query_name in QUERIES:
                query_file = "{}/{}.sparql".format(os.path.dirname(__file__),
                                                   query_name)
                log.info("Execute %s" % query_file)
                with open(query_file, 'r') as query_stream:
                    query = query_stream.read()
                r = sparql.run_select(query, self.sparql_params)
                parsed_results = self._parse_results(r)
                results[query_name] = parsed_results
                log.info("Results %s" % parsed_results)
            with open('/tmp/results.json', 'w') as outfile:
                json.dump(results, outfile)

        # Prepare the table with the overview for the sources
        table = {}
        for entry in results['parsed_sheets']:
            src = entry['src']
            table.setdefault(src, {})
            table[src]['sheets'] = "{}/{}".format(entry['nbsheetsparsed'],
                                                  entry['nbsheets'])
        for entry in results['tablinker_output']:
            src = entry['src']
            table.setdefault(src, {})
            header_type = entry['type'].replace(TABLINKER, 'tablinker:')
            table[src][header_type] = entry['total']

        # Prepare the spider chart overview of the dimensions used
        spider_labels = []
        spider_data = []
        for entry in results['dimension_usage']:
            spider_data.append(int(entry['nbobs']))
            spider_labels.append(str(entry['dimension']))

        # Process the template
        data = {
            'table': table,
            'spider': {
                'label': spider_labels,
                'data': spider_data
            }
        }
        tmpl_file_name = "{}/stats.html".format(os.path.dirname(__file__))
        with open(tmpl_file_name, 'r') as tmpl_file:
            template = Template(tmpl_file.read())
        with open(output_file_name, 'w') as outfile:
            outfile.write(template.render(data))
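
The method above caches its query results in /tmp/results.json so that repeated runs can skip the SPARQL round trips. The same pattern in isolation, as a minimal sketch that ignores the use_cache flag for brevity; the compute callable and the dummy payload stand in for the query loop and are purely illustrative.

import json
import os

def load_or_compute(cache_path, compute):
    # Reuse the cached JSON file when it exists, otherwise run the expensive
    # computation and cache its result for the next run
    if os.path.isfile(cache_path):
        with open(cache_path, 'r') as infile:
            return json.load(infile)
    results = compute()
    with open(cache_path, 'w') as outfile:
        json.dump(results, outfile)
    return results

# Illustrative usage with a dummy computation in place of the SPARQL queries
results = load_or_compute('/tmp/results.json', lambda: {'parsed_sheets': []})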
Example 3
    def loadHeaders(self, graph_name):
        '''
        This method fetches all the headers used in the raw data and
        saves them as a cache in a CSV file
        '''
        # Load and execute the SPARQL query, save to the cache too
        sparql = SPARQLWrap(self.end_point)
        sparql_params = {'__DATA_SET__' : self.data_ns[self.dataset].n3(),
                         '__RAW_DATA__' : graph_name}
        results = sparql.run_select(HEADERS_QUERY, sparql_params)
        for result in results:
            # Parse the result
            cell = result['cell']['value']
            cell_name = cell.split('/')[-1]
            header_type = result['header_type']['value']
            dataset_name = result['dataset_name']['value']
            sheet_name = self.dataset.split('/')[-1] 
            literal = result['literal']['value']
            row = [cell_name, literal, header_type, cell_name, sheet_name, dataset_name]
            # Save to the headers list
            self.headers.append(row)

        log.info("[{}] Loaded {} headers".format(self.dataset, len(self.headers)))
Example 4
class RulesInjector(object):
    def __init__(self, end_point, rules_graph, raw_data_graph):
        """
        Constructor
        """
        # Variables
        self.end_point = end_point
        self.rules_graph = rules_graph
        self.raw_data_graph = raw_data_graph
        
    def process_workbook(self, input_file_name, output_file_name):
        """
        Start processing all the sheets in workbook
        """
        # Base name for logging
        basename = os.path.basename(input_file_name)
        
        # Load the book
        log.info('[{}] Loading {}'.format(basename, input_file_name))
        book = load(unicode(input_file_name))
        
        # Go!
        log.debug('[{}] Starting RulesInjector'.format(basename))
        sheets = book.getElementsByType(Table)
        
        # Process all the sheets
        log.info('[{}] Found {} sheets to process'.format(basename, len(sheets)))
        for n in range(len(sheets)):
            log.debug('[{}] Processing sheet {}'.format(basename, n))
            try:
                self._process_sheet(basename, n, sheets[n])
            except Exception as detail:
                log.error("[{}] Error processing sheet {} : {}".format(basename, n, detail))

        book.save(unicode(output_file_name))
        
    def _process_sheet(self, basename, n, sheet):
        """
        Process a sheet 
        """        
        log.debug('[{}] Load rules'.format(basename))
        annotations_map = {}
        # SPARQL Wrapper
        self.sparql = SPARQLWrap(self.end_point)
        sparql_params = {'__RULES__': self.rules_graph,
                         '__RAW_DATA__' : self.raw_data_graph,
                         '__FILE_NAME__' : Literal(basename).n3()}
        results = self.sparql.run_select(QUERY_ANNOTATIONS, sparql_params)
        for result in results:
            cell_name = result['cell_name']['value'].split('=')[0]
            po_pair = '{}={}'.format(result['p']['value'], result['o']['value'])
            
            annotations_map.setdefault(cell_name, office.Annotation())
            annot = annotations_map[cell_name]
            annot.addElement(P(text=po_pair))
            
        log.debug('[{}] Inject the annotations'.format(basename))
        rows = sheet.getElementsByType(TableRow)
        for rowIndex in range(0, len(rows)):
            cols = getColumns(rows[rowIndex])
            for colIndex in range(0, len(cols)):
                cell_obj = cols[colIndex]
                if cell_obj is None:
                    continue

                # Get the cell name
                cell_name = colName(colIndex) + str(rowIndex + 1)
                
                if cell_name in annotations_map:
                    annot = annotations_map[cell_name]
                    log.debug('[{}] {} => {}'.format(basename, cell_name, annot))
                    cell_obj.addElement(annot)
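
A hypothetical usage of RulesInjector; the endpoint, graph URIs and file names below are placeholders rather than values from the project's configuration.

injector = RulesInjector(end_point='http://localhost:8890/sparql',
                         rules_graph='http://example.org/graph/rules',
                         raw_data_graph='http://example.org/graph/raw-data')
injector.process_workbook('census_table.ods', 'census_table_annotated.ods')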
Example 5
    def generate_dsd(self, title, measure, measure_unit, slices, output_file):
        '''
        Save all the additional data into TTL files. The output contains data
        that spans all the processed raw cubes
        '''
        # The graph that will be used to store the cube
        graph = ConjunctiveGraph()
        graph.bind('prov', PROV)
        graph.bind('dcterms', DCTERMS)
        graph.bind('qb', QB)
        graph.bind('sdmx-dimension', SDMXDIMENSION)
        graph.bind('sdmx-attribute', SDMXATTRIBUTE)
        graph.bind('data', self.data_ns)

        # Create the data set description
        ds_uri = self.data_ns['harmonised-cube']
        graph.add((ds_uri, RDF.type, QB.DataSet))
        graph.add((ds_uri, RDF.type, PROV.Entity))
        graph.add((ds_uri, DCTERMS.title, Literal(title)))
        graph.add((ds_uri, RDFS.label, Literal(title)))

        # Create the DSD
        dsd_uri = ds_uri + '-dsd'
        graph.add((ds_uri, QB.structure, dsd_uri))
        graph.add((dsd_uri, RDF.type, QB.DataStructureDefinition))
        graph.add((dsd_uri, SDMXATTRIBUTE.unitMeasure, URIRef(measure_unit)))

        # Bind all the dimensions
        sparql = SPARQLWrap(self.end_point)
        params = {'__RELEASE__': self.release_graph_name}
        results = sparql.run_select(QUERY_DIMS, params)
        dims = [URIRef(r['dim']['value']) for r in results]
        if URIRef(measure) in dims:
            dims.remove(URIRef(measure))  # We need to remove the measure
        for index in range(0, len(dims)):
            dim_uri = BNode()
            graph.add((dsd_uri, QB.component, dim_uri))
            graph.add((dim_uri, QB.dimension, dims[index]))
            graph.add((dim_uri, QB.order, Literal(index + 1)))

        # Bind all the dimensions used in the slices too
        slice_dims = list(set([s['property'] for s in slices]))
        for index in range(0, len(slice_dims)):
            dim_uri = BNode()
            graph.add((dsd_uri, QB.component, dim_uri))
            graph.add((dim_uri, QB.dimension, URIRef(slice_dims[index])))
            graph.add((dim_uri, QB.order, Literal(len(dims) + index + 1)))
            graph.add((dim_uri, QB.componentAttachment, QB.Slice))

        # Bind the measure
        measure_uri = BNode()
        graph.add((dsd_uri, QB.component, measure_uri))
        graph.add((measure_uri, QB.measure, URIRef(measure)))

        # Bind the attributes
        attr_uri = BNode()
        graph.add((dsd_uri, QB.component, attr_uri))
        graph.add((attr_uri, QB.attribute, SDMXATTRIBUTE.unitMeasure))
        graph.add((attr_uri, QB.componentRequired,
                   Literal("true", datatype=XSD.boolean)))
        graph.add((attr_uri, QB.componentAttachment, QB.DataSet))

        # Now create all the slices
        for index in range(0, len(slices)):
            # That's our slice
            s = slices[index]

            # Add a slice key to the DSD
            slice_uri = ds_uri + '-slice_' + str(index)
            slicekey_uri = slice_uri + '-key'
            graph.add((dsd_uri, QB.sliceKey, slicekey_uri))
            graph.add((slicekey_uri, RDF.type, QB.SliceKey))
            graph.add((slicekey_uri, RDFS.label, Literal(s['title'])))
            graph.add(
                (slicekey_uri, QB.componentProperty, URIRef(s['property'])))

            # Try to guess the type of the value
            casted_val = s['value']
            try:
                casted_val = int(casted_val)
            except ValueError:
                pass
            val = Literal(casted_val)

            # Describe the slice
            graph.add((slice_uri, RDF.type, QB.Slice))
            graph.add((slice_uri, QB.sliceStructure, slicekey_uri))
            graph.add((slice_uri, URIRef(s['property']), val))

            # Attach all the relevant observations to it
            sparql = SPARQLWrap(self.end_point)
            s2 = [Literal(s).n3() for s in s['sources']]
            params = {
                '__RELEASE__': self.release_graph_name,
                '__RAW_DATA__': self.raw_data_graph_name,
                '__SOURCES__': ','.join(s2)
            }
            results = sparql.run_select(QUERY_MEMBER_OBS, params)
            for r in results:
                graph.add(
                    (slice_uri, QB.observation, URIRef(r['obs']['value'])))

        log.info("[{}] Contains {} triples".format(output_file, len(graph)))
        try:
            if self.compress_output:
                out = bz2.BZ2File(output_file + '.bz2', 'wb', compresslevel=9)
            else:
                out = open(output_file, "wb")
            graph.serialize(destination=out, format='n3')
            out.close()
        except Exception:
            log.error("Whoops! Something went wrong while serializing to the output file")
            log.info(sys.exc_info())
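
The slices argument is a list of dictionaries; the keys the method reads above are 'title', 'property', 'value' and 'sources'. A hypothetical call with placeholder URIs and values, where generator stands for an instance of the class that defines generate_dsd.

slices = [{
    'title': 'Census year 1900',
    'property': 'http://example.org/dimension/censusYear',
    'value': '1900',
    'sources': ['source_A', 'source_B'],
}]
generator.generate_dsd(title='Harmonised census cube',
                       measure='http://example.org/measure/population',
                       measure_unit='http://example.org/unit/person',
                       slices=slices,
                       output_file='harmonised-cube-dsd.ttl')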