Ejemplo n.º 1
0
    def query(self, entity_uri):
        """Look up the index entry stored for an entity.

        The URI is normalized with ``URI.parse(entity_uri).short()`` and the
        result is used as the key into ``self.index``.

        :param entity_uri: URI of the entity to look up.
        :return: the value stored in ``self.index`` under the short form of
            the URI, or an empty list if there is no such entry.
        """
        key = URI.parse(entity_uri).short()
        # Use ``get`` rather than subscripting: if the index is a
        # ``defaultdict``, ``self.index[key]`` never raises ``KeyError`` and
        # instead inserts an empty entry for every unknown entity queried,
        # so the index would grow with each miss.
        return self.index.get(key, [])
Ejemplo n.º 2
0
    def run(self, table):
        """Annotate cells whose content matches a property of a linked entity.

        For every row, each preprocessing ``EntityLinking`` annotation is
        looked up in ``self.backend``.  The returned properties are matched
        against all other cells of the same row with
        ``self.match_properties``, and every match is appended to the matching
        cell's annotations as a ``LiteralLinking`` annotation.

        :param table: object whose ``rows()`` yields iterables of cells; each
            cell exposes ``idx`` and a mutable ``annotations`` list.
        :return: always ``True``.
        """
        for row in table.rows():
            # pair each cell of the row with its preprocessing EL annotations,
            # remembering every annotation's position in the cell's list
            linked_cells = []
            for cell in row:
                hits = [
                    (position, annotation)
                    for position, annotation in enumerate(cell.annotations)
                    if annotation['source'] == 'preprocessing'
                    and annotation['task'] == 'EntityLinking'
                ]
                if hits:
                    linked_cells.append((cell, hits))

            for entity_cell, hits in linked_cells:
                for hit_position, hit in hits:
                    # an entity without any known property cannot match
                    properties = self.backend.query(hit['resource_uri'])
                    if not properties:
                        continue

                    for candidate in row:
                        # never match an entity against its own cell
                        if candidate.idx == entity_cell.idx:
                            continue

                        matches = self.match_properties(candidate, properties)
                        for raw_uri, match_infos in matches.items():
                            parsed_uri = URI.parse(raw_uri)
                            for match_info in match_infos:
                                linking = {
                                    'source': 'preprocessing',
                                    'task': 'LiteralLinking',
                                    'type': 'property',
                                    'property_uri': parsed_uri.long(),
                                    # "<cell idx>/<annotation position>" of
                                    # the EL annotation this match refers to
                                    'references_el': '{:d}:{:d}/{:d}'.format(
                                        *entity_cell.idx, hit_position),
                                }
                                # match details are merged last so they win
                                # over the fixed keys on a name clash
                                linking.update(match_info)
                                candidate.annotations.append(linking)
        return True
Ejemplo n.º 3
0
    def match_properties(self, cell, properties):
        """Match a cell's normalized literals against an entity's properties.

        Every ``LiteralNormalization`` annotation of ``cell`` is compared with
        every property using the matcher that belongs to the annotation's
        ``type`` (``match_date``, ``match_numeric``, ``match_value_unit`` or
        ``match_string``).  Annotations of any other type never match.

        :param cell: object exposing ``idx`` and an ``annotations`` list of
            dicts.
        :param properties: iterable of ``(property_uri, property_type,
            property_value)`` triples.
        :return: ``defaultdict(list)`` mapping each property URI that matched
            to a list of dicts with the keys ``references_ln``,
            ``transformations``, ``index_value`` and ``index_type``.
        """
        found = defaultdict(list)

        for property_uri, property_type, property_value in properties:
            for position, annotation in enumerate(cell.annotations):
                # only normalized literals are compared against the index
                if annotation['task'] != 'LiteralNormalization':
                    continue

                reference = '{:d}:{:d}/{:d}'.format(*cell.idx, position)
                kind = annotation['type']

                # pick the matcher that fits the kind of normalized literal
                if kind == 'date':
                    transformations = self.match_date(
                        property_type,
                        property_value,
                        {
                            'year': annotation['year'],
                            'month': annotation['month'],
                            'day_of_month': annotation['day_of_month'],
                        },
                    )
                elif kind == 'numeric':
                    transformations = self.match_numeric(
                        property_value, annotation['number'])
                elif kind == 'value and unit':
                    transformations = self.match_value_unit(
                        property_value,
                        annotation['value'],
                        annotation['value_normalized'],
                    )
                elif kind == 'plain':
                    transformations = self.match_string(
                        property_value, annotation['string'])
                else:
                    transformations = []

                if not transformations:
                    continue

                # an empty property type is recorded as an empty string
                index_type = (URI.parse(property_type).long()
                              if len(property_type) > 0 else '')

                found[property_uri].append({
                    'references_ln': reference,
                    'transformations': transformations,
                    'index_value': property_value,
                    'index_type': index_type,
                })

        return found
Ejemplo n.º 4
0
    def __init__(self, index_file, delimiter='\t', quotechar=None):
        """Load a mention-to-URI index from a delimited text file.

        Every row of ``index_file`` must consist of exactly two fields, a
        mention and a URI.  Mentions are lower-cased before being used as
        keys; each URI is stored as ``URI.parse(uri, 'dbo').short()``.  A
        later row replaces an earlier one with the same lower-cased mention.

        :param index_file: path of the index file; it is read as UTF-8 and
            undecodable bytes are ignored.
        :param delimiter: field delimiter handed to ``csv.reader``.
        :param quotechar: quote character handed to ``csv.reader``.
        """
        self.index = {}

        with io.open(index_file, 'r', encoding='utf-8',
                     errors='ignore') as source:
            records = csv.reader(source,
                                 delimiter=delimiter,
                                 quotechar=quotechar)
            # unpacking in the loop header enforces the two-field layout
            for mention, uri in records:
                short_uri = URI.parse(uri, 'dbo').short()
                self.index[mention.lower()] = short_uri
Ejemplo n.º 5
0
    def __init__(self, index_file, delimiter='\t', quotechar=None):
        """Load an entity-to-properties index from a delimited text file.

        Every row of ``index_file`` must consist of exactly four fields:
        entity URI, property URI, literal type and literal value.  Rows are
        grouped under ``URI.parse(entity_uri, 'dbr').short()``; each group is
        a list of ``(property_uri, literal_type, literal_value)`` tuples in
        file order.

        :param index_file: path of the index file; it is read as UTF-8 and
            undecodable bytes are ignored.
        :param delimiter: field delimiter handed to ``csv.reader``.
        :param quotechar: quote character handed to ``csv.reader``.
        """
        # short entity URI -> list of (property, type, value) tuples
        self.index = defaultdict(list)

        # the whole file is read into memory up front
        with io.open(index_file, 'r', encoding='utf-8',
                     errors='ignore') as source:
            records = csv.reader(source,
                                 delimiter=delimiter,
                                 quotechar=quotechar)
            # unpacking in the loop header enforces the four-field layout
            for entity_uri, property_uri, literal_type, literal_value in records:
                key = URI.parse(entity_uri, 'dbr').short()
                self.index[key].append(
                    (property_uri, literal_type, literal_value))
Ejemplo n.º 6
0
    def __init__(self, index_file, delimiter='\t', quotechar=None):
        """Load a mention-to-candidates index from a delimited text file.

        Every row of ``index_file`` must consist of exactly three fields:
        mention, URI and frequency.  The mention is passed through
        ``preprocess_mention``; rows whose preprocessed mention is falsy are
        dropped.  Remaining rows are grouped by preprocessed mention as
        ``(URI.parse(uri, 'dbr'), int(frequency))`` tuples in file order.

        :param index_file: path of the index file; it is read as UTF-8 and
            undecodable bytes are ignored.
        :param delimiter: field delimiter handed to ``csv.reader``.
        :param quotechar: quote character handed to ``csv.reader``.
        """
        # preprocessed mention -> list of (parsed URI, frequency) tuples
        self.index = defaultdict(list)

        # the whole file is read into memory up front
        with io.open(index_file, 'r', encoding='utf-8',
                     errors='ignore') as source:
            records = csv.reader(source,
                                 delimiter=delimiter,
                                 quotechar=quotechar)
            # unpacking in the loop header enforces the three-field layout
            for raw_mention, uri, frequency in records:
                mention = preprocess_mention(raw_mention)
                if not mention:
                    continue
                candidate = URI.parse(uri, 'dbr')
                self.index[mention].append((candidate, int(frequency)))
Ejemplo n.º 7
0
    def run(self, table):
        """Attach ``ClassLinking`` annotations to the table's header cells.

        The header row is taken from ``table.table_data['headerRowIndex']``.
        Nothing is annotated when that key is absent or its value is ``-1``.
        For each header cell the backend is queried with the cell content;
        a non-``None`` result is appended to the cell's annotations as a
        class URI in long form.

        :param table: object exposing ``table_data`` and ``rows()``.
        :return: always ``True``.
        """
        table_data = table.table_data
        if 'headerRowIndex' not in table_data:
            return True

        header_position = table_data['headerRowIndex']
        if header_position == -1:
            return True

        for header_cell in table.rows()[header_position]:
            found = self.backend.query(header_cell.content)
            if found is None:
                continue
            class_uri = URI.parse(found, 'dbo')
            header_cell.annotations.append({
                'source': 'preprocessing',
                'task': 'ClassLinking',
                'type': 'class',
                'class_uri': class_uri.long(),
            })

        return True
Ejemplo n.º 8
0
# Short-form URIs of every property referenced by a gold PropertyLinking
# annotation on a table column.
gold_properties = set()

# NOTE(review): `gold_entities`, `gold_data_name`, `Table`, `URI` and
# `JSONDecodeError` are not defined in this fragment; presumably they are
# set up earlier in the script -- confirm.
#
# The gold file is read line by line and every line is parsed as one JSON
# document that is wrapped in a `Table`.
with io.open(gold_data_name, 'r', encoding='utf-8', errors='ignore') as gold_data_fh:
    for table_json in gold_data_fh:
        try:
            table_data = json.loads(table_json)
            table = Table(table_data)

            # get EntityLinking annotations from cells
            for cell in table.cells():
                el_annotations = cell.find_annotations(
                    anno_source = 'gold-v2',
                    anno_task = 'EntityLinking'
                )
                for el_anno in el_annotations:
                    # collect the short form of each linked resource URI
                    resource_uri = URI.parse(el_anno['resource_uri'], 'dbr')
                    gold_entities.add(resource_uri.short())

            # get PropertyLinking annotations from columns
            for column in table.columns():
                pl_annotations = column.find_annotations(
                    anno_source = 'gold-v2',
                    anno_task = 'PropertyLinking'
                )
                for pl_anno in pl_annotations:
                    # collect the short form of each linked property URI
                    property_uri = URI.parse(pl_anno['property_uri'])
                    gold_properties.add(property_uri.short())
        except JSONDecodeError:
            # A line that raises JSONDecodeError is skipped without any
            # report.  The handler guards the whole loop body, not only the
            # json.loads call.
            pass

# summary