Beispiel #1
0
    def handleAnnotation(self, cell, annotation):
        """
        Add the Open Annotation triples for an annotation found on a cell.

        cell       -- cell description dict; only cell['URI'] is read here
        annotation -- the annotation element attached to that cell

        Two nodes are described: the annotation itself (the cell URI suffixed
        with "-oa") and its body (the annotation URI suffixed with "-body").
        The body carries the cleaned annotation text and, when present, the
        first creator and the first date found inside the annotation.
        Nothing is returned; the triples are added to self.graph.
        """
        add_triple = self.graph.add

        target_uri = cell['URI']
        oa_uri = target_uri + "-oa"
        body_uri = oa_uri + '-body'

        # The annotation node: typed, pointing at the cell and at its body
        add_triple((oa_uri, RDF.type, OA.Annotation))
        add_triple((oa_uri, OA.hasTarget, target_uri))
        add_triple((oa_uri, OA.hasBody, body_uri))

        # The body node: typed, holding the cleaned text of the annotation
        add_triple((body_uri, RDF.type, RDFS.Resource))
        body_text = clean_string(getText(annotation))
        add_triple((body_uri, TABLINKER.value, Literal(body_text)))

        # Author: only the first dc.Creator element is used.
        # NOTE(review): author and date are attached to the body node, not to
        # the annotation node -- confirm this is the intended modelling.
        creators = annotation.getElementsByType(dc.Creator)
        if creators:
            author_name = clean_string(str(creators[0]))
            add_triple((body_uri, OA.annotatedBy, Literal(author_name)))

        # Date: only the first dc.Date element is used, typed as xsd:date
        dates = annotation.getElementsByType(dc.Date)
        if dates:
            date_literal = Literal(str(dates[0]), datatype=XSD.date)
            add_triple((body_uri, OA.serializedAt, date_literal))
Beispiel #2
0
    def parseSheet(self, n, sheet):
        """
        Parse one sheet of the workbook and produce the relevant RDF triples.

        Every cell of the sheet is visited row by row. For each non-empty slot
        a description dict is built, the cell is dispatched to the handler
        matching its style, and the first annotation attached to it (if any)
        is passed to handleAnnotation.

        n     -- sheet number; combined with self.basename to build the sheet URI
        sheet -- the sheet element whose TableRow children are walked

        Returns a tuple (sheetURI, marked_count), where marked_count is the
        number of cells styled as 'TL Data', 'TL RowHeader', 'TL HRowHeader',
        'TL ColHeader' or 'TL RowProperty' ('TL Title' cells are handled but
        not counted).
        """
        # Define a sheetURI for the current sheet
        sheetURI = self.data_ns["{0}-S{1}".format(self.basename, n)]

        # Styles that increase the counter of marked cells
        marked_styles = ('TL Data', 'TL RowHeader', 'TL HRowHeader',
                         'TL ColHeader', 'TL RowProperty')

        # State shared between the cell handlers while the sheet is walked
        columnDimensions = {}
        row_dims = {}
        rowProperties = {}
        marked_count = 0

        for rowIndex, row in enumerate(sheet.getElementsByType(TableRow)):
            for colIndex, cell_obj in enumerate(getColumns(row)):
                # getColumns may yield None for a position; nothing to parse
                if cell_obj is None:
                    continue

                # Spreadsheet-style name: column name followed by 1-based row
                cellName = colName(colIndex) + str(rowIndex + 1)

                # A cell without any text paragraph is treated as empty
                if not cell_obj.getElementsByType(text.P):
                    literal = ''
                else:
                    literal = getText(cell_obj)
                    if isinstance(literal, float):
                        # Render whole numbers without a trailing ".0"
                        if literal.is_integer():
                            literal = str(int(literal))
                        else:
                            literal = str(literal)

                value = str(literal)
                cell_type = self.getStyle(cell_obj)

                cell = {
                    # Coordinates
                    'i': rowIndex,
                    'j': colIndex,
                    # The cell itself
                    'cell': cell_obj,
                    # The sheet
                    'sheet': sheet,
                    # The name of the cell
                    'name': cellName,
                    # The type of the cell
                    'type': cell_type,
                    # The (cleaned) value of the cell
                    'value': value,
                    # Is empty ?
                    'isEmpty': value == '',
                    # Compose a resource name for the cell
                    'URI': URIRef("{0}-{1}".format(sheetURI, cellName)),
                    # Pass on the URI of the data set
                    'sheetURI': sheetURI,
                }

                # Increase the counter of marked cells
                if cell_type in marked_styles:
                    marked_count += 1

                # Parse cell content according to its style
                if cell_type == 'TL Data':
                    self.handleData(cell, columnDimensions, row_dims)
                elif cell_type == 'TL RowHeader':
                    self.handleRowHeader(cell, row_dims, rowProperties)
                elif cell_type == 'TL HRowHeader':
                    self.handleHRowHeader(cell, row_dims, rowProperties)
                elif cell_type == 'TL ColHeader':
                    self.handleColHeader(cell, columnDimensions)
                elif cell_type == 'TL RowProperty':
                    self.handleRowProperty(cell, rowProperties)
                elif cell_type == 'TL Title':
                    self.handleTitle(cell)

                # Handle the annotation of the cell, if any; only the first
                # annotation element is processed
                annotations = cell_obj.getElementsByType(office.Annotation)
                if annotations:
                    self.handleAnnotation(cell, annotations[0])

        # Link every value collected in row_dims to its parent cell.
        # Assumes each row_dims entry maps a parent to an iterable of its
        # children (filled by the handlers above) -- TODO confirm.
        for parents in row_dims.values():
            for (p, vs) in parents.items():
                for v in vs:
                    try:
                        self.graph.add((v, TABLINKER.parentCell, p))
                    except AssertionError:
                        # Presumably raised by graph.add when a term is not a
                        # valid RDF node; such links are skipped on purpose.
                        logger.debug('Ignore {}'.format(p))

        # Currently disabled: additional information about the hierarchy of
        # column headers
        # for value in columnDimensions.values():
        #    for index in range(1, len(value)):
        #        uri_sub = self.getColHeaderValueURI(value[:index + 1])
        #        uri_top = self.getColHeaderValueURI(value[:index])
        #        self.graph.add((uri_sub, self.namespaces['tablink']['subColHeaderOf'], uri_top))

        return (sheetURI, marked_count)