def query(self, entity_uri):
    """Look up the indexed literal entries for an entity.

    Parameters
    ----------
    entity_uri : str
        Entity URI in any form accepted by ``URI.parse``.

    Returns
    -------
    list
        The list stored under the entity's short URI in ``self.index``
        (tuples of ``(property_uri, literal_type, literal_value)`` as
        loaded by ``__init__``), or an empty list when the entity is
        not indexed.
    """
    entity_uri = URI.parse(entity_uri)
    # Use dict.get instead of try/except KeyError: when self.index is a
    # defaultdict(list) (as built in __init__), subscripting a missing key
    # never raises — it silently INSERTS an empty list, growing the index
    # on every failed lookup.  .get() has no such side effect.
    return self.index.get(entity_uri.short(), [])
def run(self, table):
    """Annotate literal cells that match properties of linked entities.

    For every row: collect the cells that already carry an
    'EntityLinking' annotation (source 'preprocessing'), query the
    backend for each linked entity's properties, and try to match every
    OTHER cell in the same row against those properties.  Each match is
    recorded on the other cell as a 'LiteralLinking' annotation that
    back-references the entity annotation it was derived from.

    Returns True unconditionally (pipeline-stage convention — the table
    is modified in place).
    """
    # iterate over all rows
    for row in table.rows():
        # find all 'entity' cell in the current row
        el_cells = []
        for cell in row:
            # el_annos = cell.find_annotations(anno_source='preprocessing', anno_task='EntityLinking')
            # if el_annos:
            #     el_cells.append((cell, el_annos))
            # Manual scan instead of find_annotations because the
            # annotation INDEX is needed for the back-reference below.
            el_annos = []
            for cell_anno_idx, cell_anno in enumerate(cell.annotations):
                if cell_anno['source'] == 'preprocessing' and cell_anno[
                        'task'] == 'EntityLinking':
                    el_annos.append((cell_anno_idx, cell_anno))
            if el_annos:
                el_cells.append((cell, el_annos))
        # iterate over all 'entity' cells
        for el_cell, el_annos in el_cells:
            # iterate over all EL annotations of this cell
            for el_anno_idx, el_anno in el_annos:
                # query the backend for the set of this entitie's properties,
                # skip this entity if there are none
                properties = self.backend.query(el_anno['resource_uri'])
                if not properties:
                    continue
                # iterate over all other cells in this row
                # i.e. not the one containing the entity
                for other_cell in row:
                    if other_cell.idx == el_cell.idx:
                        continue
                    # try to match the cell's content with one of
                    # the entity's properties
                    matching_properties = self.match_properties(
                        other_cell, properties)
                    for property_uri, match_infos in matching_properties.items(
                    ):
                        property_uri = URI.parse(property_uri)
                        for match_info in match_infos:
                            # 'references_el' encodes the source entity
                            # annotation as "row:col/anno_idx" — assumes
                            # cell.idx is a 2-tuple of ints (TODO confirm).
                            other_cell.annotations.append({
                                'source': 'preprocessing',
                                'task': 'LiteralLinking',
                                'type': 'property',
                                'property_uri': property_uri.long(),
                                'references_el': '{:d}:{:d}/{:d}'.format(
                                    *el_cell.idx, el_anno_idx),
                                **match_info
                            })
    return True
def match_properties(self, cell, properties):
    """Match a cell's normalized literal values against entity properties.

    For each ``(property_uri, property_type, property_value)`` triple,
    compare the property value against every 'LiteralNormalization'
    annotation on the cell, dispatching on the normalization type.

    Returns a dict mapping property URI -> list of match-info dicts
    (back-reference to the normalization annotation, the applied
    transformations, and the index value/type that matched).
    """
    matches = defaultdict(list)
    for prop_uri, prop_type, prop_value in properties:
        for anno_idx, anno in enumerate(cell.annotations):
            # only normalized cell values are compared against the index
            if anno['task'] != 'LiteralNormalization':
                continue
            ln_ref = '{:d}:{:d}/{:d}'.format(*cell.idx, anno_idx)
            ln_kind = anno['type']
            # dispatch on the kind of normalization to collect the
            # transformations / metric scores for this comparison
            if ln_kind == 'date':
                date_parts = {
                    key: anno[key]
                    for key in ('year', 'month', 'day_of_month')
                }
                transformations = self.match_date(
                    prop_type, prop_value, date_parts)
            elif ln_kind == 'numeric':
                transformations = self.match_numeric(
                    prop_value, anno['number'])
            elif ln_kind == 'value and unit':
                transformations = self.match_value_unit(
                    prop_value, anno['value'], anno['value_normalized'])
            elif ln_kind == 'plain':
                transformations = self.match_string(
                    prop_value, anno['string'])
            else:
                transformations = []
            if not transformations:
                continue
            matches[prop_uri].append({
                'references_ln': ln_ref,
                'transformations': transformations,
                'index_value': prop_value,
                'index_type': (URI.parse(prop_type).long()
                               if len(prop_type) > 0 else ''),
            })
    return matches
def __init__(self, index_file, delimiter='\t', quotechar=None):
    """Build a lowercased-mention -> short class URI lookup table.

    Reads the whole delimited ``index_file`` (two columns: mention, URI);
    later rows with the same mention overwrite earlier ones.
    """
    self.index = {}
    with io.open(index_file, 'r', encoding='utf-8',
                 errors='ignore') as index_fh:
        for mention, uri in csv.reader(index_fh, delimiter=delimiter,
                                       quotechar=quotechar):
            # mentions are matched case-insensitively; URIs are stored
            # in their short 'dbo'-prefixed form
            self.index[mention.lower()] = URI.parse(uri, 'dbo').short()
def __init__(self, index_file, delimiter='\t', quotechar=None):
    """Load an entity -> literal-properties index from a delimited file.

    Each row of ``index_file`` holds four columns: entity URI, property
    URI, literal type, literal value.  Rows are grouped by the entity's
    short 'dbr'-prefixed URI.
    """
    # maps short entity URI -> list of (property_uri, type, value)
    self.index = defaultdict(list)
    with io.open(index_file, 'r', encoding='utf-8',
                 errors='ignore') as index_fh:
        reader = csv.reader(index_fh, delimiter=delimiter,
                            quotechar=quotechar)
        for entity_uri, property_uri, literal_type, literal_value in reader:
            key = URI.parse(entity_uri, 'dbr').short()
            self.index[key].append(
                (property_uri, literal_type, literal_value))
def __init__(self, index_file, delimiter='\t', quotechar=None):
    """Load a mention -> candidate-entity index from a delimited file.

    Each row of ``index_file`` holds three columns: mention, entity URI,
    frequency.  Mentions are normalized via ``preprocess_mention``;
    rows whose mention normalizes to an empty value are skipped.
    """
    # maps preprocessed mention -> list of (URI, frequency) candidates
    self.index = defaultdict(list)
    with io.open(index_file, 'r', encoding='utf-8',
                 errors='ignore') as index_fh:
        for mention, uri, frequency in csv.reader(
                index_fh, delimiter=delimiter, quotechar=quotechar):
            key = preprocess_mention(mention)
            if not key:
                continue
            self.index[key].append(
                (URI.parse(uri, 'dbr'), int(frequency)))
def run(self, table):
    """Annotate header-row cells with linked ontology classes.

    Queries the backend with each header cell's content and, on a hit,
    attaches a 'ClassLinking' annotation carrying the long class URI.

    Returns True unconditionally (pipeline-stage convention — the table
    is modified in place).
    """
    # guard clauses: nothing to do without a usable header row
    if 'headerRowIndex' not in table.table_data:
        return True
    header_row_index = table.table_data['headerRowIndex']
    if header_row_index == -1:
        return True
    for cell in table.rows()[header_row_index]:
        class_uri = self.backend.query(cell.content)
        if class_uri is None:
            continue
        cell.annotations.append({
            'source': 'preprocessing',
            'task': 'ClassLinking',
            'type': 'class',
            'class_uri': URI.parse(class_uri, 'dbo').long(),
        })
    return True
# Collect the gold-standard property set (and add to the gold entity set)
# from the line-delimited JSON gold data file.
# NOTE(review): `gold_entities` and `gold_data_name` are not defined in
# this fragment — presumably initialized earlier in the file; verify.
gold_properties = set()
with io.open(gold_data_name, 'r', encoding='utf-8',
             errors='ignore') as gold_data_fh:
    # one JSON-encoded table per line
    for table_json in gold_data_fh:
        try:
            table_data = json.loads(table_json)
            table = Table(table_data)
            # get EntityLinking annotations from cells
            for cell in table.cells():
                el_annotations = cell.find_annotations(
                    anno_source = 'gold-v2',
                    anno_task = 'EntityLinking'
                )
                for el_anno in el_annotations:
                    resource_uri = URI.parse(el_anno['resource_uri'], 'dbr')
                    gold_entities.add(resource_uri.short())
            # get PropertyLinking annotations from columns
            for column in table.columns():
                pl_annotations = column.find_annotations(
                    anno_source = 'gold-v2',
                    anno_task = 'PropertyLinking'
                )
                for pl_anno in pl_annotations:
                    property_uri = URI.parse(pl_anno['property_uri'])
                    gold_properties.add(property_uri.short())
        except JSONDecodeError:
            # malformed lines are skipped (best-effort load)
            pass
# summary