def _get_sheets_list(self):
    '''
    Get a list of the sheets loaded in the triple store
    '''
    sparql = SPARQLWrap(self._conf.get_SPARQL())
    params = {'__RAW_DATA__': self._conf.get_graph_name('raw-data')}
    results = sparql.run_select(SHEETS_QUERY, params)
    datasets = [sparql.format(r['sheet']) for r in results]
    return datasets
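# A minimal sketch of what SHEETS_QUERY could look like; the actual query is
# defined elsewhere in the project and its exact text is an assumption here.
# It illustrates the placeholder convention used throughout: SPARQLWrap
# substitutes keys such as __RAW_DATA__ in the query string with the values
# passed in the params dictionary, and the bindings expose a ?sheet variable
# that is then formatted by sparql.format().
SHEETS_QUERY_SKETCH = """
SELECT DISTINCT ?sheet
WHERE {
    GRAPH __RAW_DATA__ {
        # The real query presumably constrains ?sheet to the sheet resources
        # produced by TabLinker; the exact triple pattern is not shown here.
        ?sheet ?p ?o .
    }
}
"""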
def go(self, output_file_name):
    '''
    Compute all the statistics
    '''
    # Run all the queries
    results = {}
    if self.use_cache and os.path.isfile('/tmp/results.json'):
        log.info("Load cached data")
        with open('/tmp/results.json', 'r') as infile:
            results = json.load(infile)
    else:
        sparql = SPARQLWrap(self.end_point)
        for query_name in QUERIES:
            query_file = "{}/{}.sparql".format(os.path.dirname(__file__), query_name)
            log.info("Execute %s" % query_file)
            query = open(query_file, 'r').read()
            r = sparql.run_select(query, self.sparql_params)
            parsed_results = self._parse_results(r)
            results[query_name] = parsed_results
            log.info("Results %s" % parsed_results)
        with open('/tmp/results.json', 'w') as outfile:
            json.dump(results, outfile)

    # Prepare the table with the overview of the sources
    table = {}
    for entry in results['parsed_sheets']:
        src = entry['src']
        table.setdefault(src, {})
        table[src]['sheets'] = "{}/{}".format(entry['nbsheetsparsed'], entry['nbsheets'])
    for entry in results['tablinker_output']:
        src = entry['src']
        table.setdefault(src, {})
        header_type = entry['type'].replace(TABLINKER, 'tablinker:')
        table[src][header_type] = entry['total']

    # Prepare the spider chart with the overview of the dimensions used
    spider_labels = []
    spider_data = []
    for entry in results['dimension_usage']:
        spider_data.append(int(entry['nbobs']))
        spider_labels.append(str(entry['dimension']))

    # Process the template
    data = {
        'table': table,
        'spider': {
            'label': spider_labels,
            'data': spider_data
        }
    }
    tmpl_file_name = "{}/stats.html".format(os.path.dirname(__file__))
    template = Template(open(tmpl_file_name, 'r').read())
    with open(output_file_name, 'w') as outfile:
        outfile.write(template.render(data))
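# A minimal sketch of _parse_results, assuming it flattens the SPARQL JSON
# bindings into plain dictionaries: the entries are later accessed as
# entry['src'], entry['nbobs'], etc. and cached with json.dump, so the
# {'value': ..., 'type': ...} wrappers must be stripped somewhere. This is
# not necessarily the project's actual implementation.
def _parse_results(self, results):
    parsed = []
    for binding in results:
        # Keep only the plain value of every bound variable
        parsed.append({name: binding[name]['value'] for name in binding})
    return parsed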
def loadHeaders(self, graph_name):
    '''
    Fetch all the headers used in the raw data and save them as a cache
    in a CSV file
    '''
    # Load and execute the SPARQL query, save to the cache too
    sparql = SPARQLWrap(self.end_point)
    sparql_params = {'__DATA_SET__': self.data_ns[self.dataset].n3(),
                     '__RAW_DATA__': graph_name}
    results = sparql.run_select(HEADERS_QUERY, sparql_params)
    for result in results:
        # Parse the result
        cell = result['cell']['value']
        cell_name = cell.split('/')[-1]
        header_type = result['header_type']['value']
        dataset_name = result['dataset_name']['value']
        sheet_name = self.dataset.split('/')[-1]
        literal = result['literal']['value']
        row = [cell_name, literal, header_type, cell_name, sheet_name, dataset_name]

        # Save to the headers list
        self.headers.append(row)

    log.info("[{}] Loaded {} headers".format(self.dataset, len(self.headers)))
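# Sketch of the CSV caching step mentioned in the docstring of loadHeaders;
# the method name, file handling and log message are assumptions, only the
# use of a CSV cache is implied by the docstring. Requires 'import csv'.
def saveHeadersCache(self, cache_file_name):
    with open(cache_file_name, 'w') as cache_file:
        writer = csv.writer(cache_file)
        # One row per header, in the same order as assembled by loadHeaders
        for row in self.headers:
            writer.writerow(row)
    log.info("[{}] Cached {} headers in {}".format(self.dataset, len(self.headers), cache_file_name))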
class RulesInjector(object):

    def __init__(self, end_point, rules_graph, raw_data_graph):
        """
        Constructor
        """
        # Variables
        self.end_point = end_point
        self.rules_graph = rules_graph
        self.raw_data_graph = raw_data_graph

    def process_workbook(self, input_file_name, output_file_name):
        """
        Start processing all the sheets in the workbook
        """
        # Base name for logging
        basename = os.path.basename(input_file_name)

        # Load the book
        log.info('[{}] Loading {}'.format(basename, input_file_name))
        book = load(unicode(input_file_name))

        # Go!
        log.debug('[{}] Starting RulesInjector'.format(basename))
        sheets = book.getElementsByType(Table)

        # Process all the sheets
        log.info('[{}] Found {} sheets to process'.format(basename, len(sheets)))
        for n in range(len(sheets)):
            log.debug('[{}] Processing sheet {}'.format(basename, n))
            try:
                self._process_sheet(basename, n, sheets[n])
            except Exception as detail:
                log.error("[{}] Error processing sheet {} : {}".format(basename, n, detail))

        book.save(unicode(output_file_name))

    def _process_sheet(self, basename, n, sheet):
        """
        Process a sheet
        """
        log.debug('[{}] Load rules'.format(basename))
        annotations_map = {}

        # SPARQL Wrapper
        self.sparql = SPARQLWrap(self.end_point)
        sparql_params = {'__RULES__': self.rules_graph,
                         '__RAW_DATA__': self.raw_data_graph,
                         '__FILE_NAME__': Literal(basename).n3()}
        results = self.sparql.run_select(QUERY_ANNOTATIONS, sparql_params)
        for result in results:
            cell_name = result['cell_name']['value'].split('=')[0]
            po_pair = '{}={}'.format(result['p']['value'], result['o']['value'])
            annotations_map.setdefault(cell_name, office.Annotation())
            annot = annotations_map[cell_name]
            annot.addElement(P(text=po_pair))

        log.debug('[{}] Inject the annotations'.format(basename))
        rows = sheet.getElementsByType(TableRow)
        for rowIndex in range(0, len(rows)):
            cols = getColumns(rows[rowIndex])
            for colIndex in range(0, len(cols)):
                cell_obj = cols[colIndex]
                if cell_obj is None:
                    continue

                # Get the cell name and the current style
                cell_name = colName(colIndex) + str(rowIndex + 1)
                if cell_name in annotations_map:
                    annot = annotations_map[cell_name]
                    log.debug('[{}] {} => {}'.format(basename, cell_name, annot))
                    cell_obj.addElement(annot)
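# Hypothetical usage of RulesInjector; the SPARQL endpoint, graph names and
# file names below are placeholders rather than values taken from the
# project configuration. The workbook is expected to be an ODF spreadsheet,
# since the class relies on odfpy (load, Table, TableRow, office.Annotation).
if __name__ == '__main__':
    injector = RulesInjector('http://localhost:8890/sparql',
                             'urn:graph:harmonization-rules',
                             'urn:graph:raw-data')
    injector.process_workbook('dataset.ods', 'dataset-annotated.ods')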
def generate_dsd(self, title, measure, measure_unit, slices, output_file):
    '''
    Save all additional files into ttl files. Contains data that spans
    all the processed raw cubes
    '''
    # The graph that will be used to store the cube
    graph = ConjunctiveGraph()
    graph.bind('prov', PROV)
    graph.bind('dcterms', DCTERMS)
    graph.bind('qb', QB)
    graph.bind('sdmx-dimension', SDMXDIMENSION)
    graph.bind('sdmx-attribute', SDMXATTRIBUTE)
    graph.bind('data', self.data_ns)

    # Create the data set description
    ds_uri = self.data_ns['harmonised-cube']
    graph.add((ds_uri, RDF.type, QB.DataSet))
    graph.add((ds_uri, RDF.type, PROV.Entity))
    graph.add((ds_uri, DCTERMS.title, Literal(title)))
    graph.add((ds_uri, RDFS.label, Literal(title)))

    # Create the DSD
    dsd_uri = ds_uri + '-dsd'
    graph.add((ds_uri, QB.structure, dsd_uri))
    graph.add((dsd_uri, RDF.type, QB.DataStructureDefinition))
    graph.add((dsd_uri, SDMXATTRIBUTE.unitMeasure, URIRef(measure_unit)))

    # Bind all the dimensions
    sparql = SPARQLWrap(self.end_point)
    params = {'__RELEASE__': self.release_graph_name}
    results = sparql.run_select(QUERY_DIMS, params)
    dims = [URIRef(r['dim']['value']) for r in results]
    if URIRef(measure) in dims:
        dims.remove(URIRef(measure))  # We need to remove the measure
    for index in range(0, len(dims)):
        dim_uri = BNode()
        graph.add((dsd_uri, QB.component, dim_uri))
        graph.add((dim_uri, QB.dimension, dims[index]))
        graph.add((dim_uri, QB.order, Literal(index + 1)))

    # Bind all the dimensions used in the slices too
    slice_dims = list(set([s['property'] for s in slices]))
    for index in range(0, len(slice_dims)):
        dim_uri = BNode()
        graph.add((dsd_uri, QB.component, dim_uri))
        graph.add((dim_uri, QB.dimension, URIRef(slice_dims[index])))
        graph.add((dim_uri, QB.order, Literal(len(dims) + index + 1)))
        graph.add((dim_uri, QB.componentAttachment, QB.Slice))

    # Bind the measure
    measure_uri = BNode()
    graph.add((dsd_uri, QB.component, measure_uri))
    graph.add((measure_uri, QB.measure, URIRef(measure)))

    # Bind the attributes
    attr_uri = BNode()
    graph.add((dsd_uri, QB.component, attr_uri))
    graph.add((attr_uri, QB.attribute, SDMXATTRIBUTE.unitMeasure))
    graph.add((attr_uri, QB.componentRequired, Literal("true", datatype=XSD.boolean)))
    graph.add((attr_uri, QB.componentAttachment, QB.DataSet))

    # Now create all the slices
    for index in range(0, len(slices)):
        # That's our slice
        s = slices[index]

        # Add a slice key to the DSD
        slice_uri = ds_uri + '-slice_' + str(index)
        slicekey_uri = slice_uri + '-key'
        graph.add((dsd_uri, QB.sliceKey, slicekey_uri))
        graph.add((slicekey_uri, RDF.type, QB.SliceKey))
        graph.add((slicekey_uri, RDFS.label, Literal(s['title'])))
        graph.add((slicekey_uri, QB.componentProperty, URIRef(s['property'])))

        # Try to guess the type of the value
        casted_val = s['value']
        try:
            casted_val = int(casted_val)
        except ValueError:
            pass
        val = Literal(casted_val)

        # Describe the slice
        graph.add((slice_uri, RDF.type, QB.Slice))
        graph.add((slice_uri, QB.sliceStructure, slicekey_uri))
        graph.add((slice_uri, URIRef(s['property']), val))

        # Attach all the relevant observations to it
        sparql = SPARQLWrap(self.end_point)
        sources = [Literal(src).n3() for src in s['sources']]
        params = {'__RELEASE__': self.release_graph_name,
                  '__RAW_DATA__': self.raw_data_graph_name,
                  '__SOURCES__': ','.join(sources)}
        results = sparql.run_select(QUERY_MEMBER_OBS, params)
        for r in results:
            graph.add((slice_uri, QB.observation, URIRef(r['obs']['value'])))

    log.info("[{}] Contains {} triples".format(output_file, len(graph)))
    try:
        if self.compress_output:
            out = bz2.BZ2File(output_file + '.bz2', 'wb', compresslevel=9)
        else:
            out = open(output_file, "wb")
        graph.serialize(destination=out, format='n3')
        out.close()
    except:
        log.error("Whoops! Something went wrong in serializing to output file")
        log.info(sys.exc_info())