Beispiel #1
0
def main():
    """Read one JSON table per stdin line, link cell properties and plot
    a histogram of the collected property statistics.

    Reads from stdin (one JSON-encoded table per line), silently skips
    lines that are not valid JSON, and updates module-level statistics
    (``num_tabel``, ``hist_props``, ``used_props``) as a side effect.
    """
    global num_tabel  # module-level table counter ('tabel' [sic])

    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                 errors='ignore') as stdin:
        for json_line in stdin:
            try:
                table = Table(json.loads(json_line))

                # PropertyLinking annotations, one entry per column
                col = [
                    c.find_annotations(anno_task='PropertyLinking')
                    for c in table.columns()
                ]

                for row in table.rows():
                    # BUG FIX: the original compared with `is not []`,
                    # which is always True (identity against a fresh
                    # list) -- test for a non-empty result instead.
                    if row.find_annotations(anno_task='EntityLinking'):
                        # BUG FIX: the original never advanced the column
                        # index, so every cell was matched against
                        # column 0; enumerate keeps cell and column
                        # aligned.
                        for i, cell in enumerate(row):
                            if col[i]:
                                setProperties(cell, col, i)

                num_tabel += 1

            # ignore json decoding errors
            except JSONDecodeError:
                pass

    # Plot properties
    plt_bar(hist_props, used_props, 'Properties', 'Occurences',
            'hists/01_props_ov.html', 3000, '#Missing PropertyLinks',
            'Property')
Beispiel #2
0
def main():
    """Read one JSON table per stdin line, collect (entity, literal)
    pairs for annotated rows, analyze each table and plot row/column
    histograms of missing annotations.

    Reads from stdin (one JSON-encoded table per line), silently skips
    lines that are not valid JSON, and updates module-level statistics
    (``num_tabel``, ``hist_row``, ``hist_col``, ...) as a side effect.
    """
    global num_tabel  # module-level table counter ('tabel' [sic])

    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                 errors='ignore') as stdin:
        for json_line in stdin:
            try:
                table = Table(json.loads(json_line))

                # PropertyLinking annotations, one entry per column
                col = [
                    c.find_annotations(anno_task='PropertyLinking')
                    for c in table.columns()
                ]

                t = []  # rows of [entity, literal] pairs for this table
                for row in table.rows():
                    # BUG FIX: the original compared with `is not []`,
                    # which is always True (identity against a fresh
                    # list) -- test for a non-empty result instead.
                    if row.find_annotations(anno_task='EntityLinking'):
                        r = []
                        # BUG FIX: the original advanced the column
                        # index only inside `if col[i]`, so a single
                        # unannotated column desynchronized every
                        # following cell; enumerate keeps cell and
                        # column aligned.
                        for i, cell in enumerate(row):
                            if col[i]:
                                r.append([getEntity(cell),
                                          getLiteral(cell, col, i)])
                        if r:
                            t.append(r)

                if t:
                    analyzeTable(t)
                num_tabel += 1

            # ignore json decoding errors
            except JSONDecodeError:
                pass

    # Create histogram for rows
    plt_bar(hist_row, lbl_col, 'Overall number of rows ',
            'Number of empty cells in a row', 'hists/01_ov_rows.html', 4000,
            '#Missing Annotations in row', '#How often this amount is missing')

    # Create histogram columns
    plt_bar(hist_col, lbl_row, 'Overall number of cols',
            'Number of empty cells in a col', 'hists/01_ov_cols.html', 4000,
            '#Missing Annotations in column',
            '#How often this amount is missing')
Beispiel #3
0
    def run(self, table: Table):
        """Attach LiteralNormalization annotations to every non-empty
        data cell of *table*.

        Each cell always receives a 'plain' annotation.  In addition,
        the first parser that produces hypotheses (unit -> date ->
        numeric) contributes one annotation per hypothesis.  The header
        row, when one is recorded in the table data, is skipped.
        Always returns True.
        """
        cells = table.cells()

        # leave the header row (when one is recorded) untouched;
        # a missing key behaves like the -1 "no header" sentinel
        header_idx = table.table_data.get('headerRowIndex', -1)
        if header_idx != -1:
            cells = cells.where(lambda c: c.row_idx != header_idx)

        for cell in cells:
            content = cell.content
            if not content:
                continue

            # every non-empty cell gets a 'plain' annotation
            cell.annotations.append({
                'source': 'preprocessing',
                'task': 'LiteralNormalization',
                'type': 'plain',
                'string': content,
            })

            # values with units take precedence ...
            hypos = self.unit_parser.parse(content)
            if hypos:
                cell.annotations.extend({
                    'source': 'preprocessing',
                    'task': 'LiteralNormalization',
                    'type': 'value and unit',
                    **hypo
                } for hypo in hypos)
                continue

            # ... then dates ...
            hypos = self.date_parser.parse(content)
            if hypos:
                cell.annotations.extend({
                    'source': 'preprocessing',
                    'task': 'LiteralNormalization',
                    'type': 'date',
                    **hypo
                } for hypo in hypos)
                continue

            # ... then plain numbers (no-op when nothing was parsed)
            for number in self.numeric_parser.parse(content):
                cell.annotations.append({
                    'source': 'preprocessing',
                    'task': 'LiteralNormalization',
                    'type': 'numeric',
                    'number': number,
                })

        return True
Beispiel #4
0
File: wtu.py — Project: ag-sc/WTU
def process_line(json_line):
    """Annotate a single JSON-encoded table.

    Parses *json_line*, runs every task in the module-level
    ``tasks_scheduled`` list over the resulting Table and returns the
    annotated table serialized as JSON.  Returns ``None`` when the line
    is not valid JSON, when the table has no non-empty 'relation'
    field, or when any task reports failure.
    """
    # keep the try span minimal: only json.loads can raise here
    try:
        table_data = json.loads(json_line)
    # ignore JSON decoding errors
    except JSONDecodeError:
        return None

    # a usable table needs a non-empty 'relation' field
    if 'relation' not in table_data or len(table_data['relation']) == 0:
        return None

    # create table object from table_data
    table = Table(table_data)

    # run scheduled tasks; a failing task aborts the whole table.
    # (the original used a pointless for/else: the loop can only end
    # via `return`, never `break`, so the else-branch always ran)
    for task in tasks_scheduled:
        if not task.run(table):
            return None

    # output annotated table as json
    return json.dumps(table.dump())
Beispiel #5
0
def main():
    """Read one JSON table per stdin line, collect (entity, literal)
    pairs for annotated rows, analyze each table and finally dump the
    collected interesting tables to a JSON file.

    Updates module-level statistics (``num_tabel``, ``found_tables``,
    ``num_found``, ``found_tables_with_rows``) as a side effect.
    """
    global num_tabel  # module-level table counter ('tabel' [sic])

    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                 errors='ignore') as stdin:
        for json_line in stdin:
            try:
                table = Table(json.loads(json_line))

                # PropertyLinking annotations, one entry per column
                col = [c.find_annotations(anno_task='PropertyLinking')
                       for c in table.columns()]

                t = []  # rows of [entity, literal] pairs for this table
                for row in table.rows():
                    # BUG FIX: the original compared with `is not []`,
                    # which is always True (identity against a fresh
                    # list) -- test for a non-empty result instead.
                    if row.find_annotations(anno_source='preprocessing',
                                            anno_task='EntityLinking'):
                        r = []
                        # BUG FIX: the original advanced the column
                        # index only inside `if col[i]`, so a single
                        # unannotated column desynchronized every
                        # following cell; enumerate keeps cell and
                        # column aligned.
                        for i, cell in enumerate(row):
                            if col[i]:
                                r.append([getEntity_ng(cell),
                                          getLiteral_ng(cell)])
                        if r:
                            t.append(r)

                if t:
                    analyzeTable(t, col)

                num_tabel += 1

            # ignore json decoding errors
            except JSONDecodeError:
                pass

    print("Found Tables: ", found_tables, ", Number: ", num_found, "\n")

    # NOTE(review): the file name typo 'intresting' is kept on purpose --
    # renaming the output path would break existing consumers.
    # (handle renamed: the original confusingly reused the name `stdin`
    # for this *output* file)
    with io.open('intresting_tables.json', 'w', encoding='utf-8',
                 errors='ignore') as out_fh:
        json.dump(found_tables_with_rows, out_fh)
Beispiel #6
0
    def run(self, table: Table) -> bool:
        """Query the backend for every cell and attach up to
        ``self.top_n`` EntityLinking annotations per cell, weighted by
        the normalized frequency of each resource.

        The header row, when one is recorded in the table data, is
        skipped.  Always returns True.

        BUG FIX: the return annotation said ``None`` although the
        method returns True; it is now ``bool``.
        """
        cellset = table.cells()

        # leave the header row (when one is recorded) untouched
        if 'headerRowIndex' in table.table_data:
            header_row_index = table.table_data['headerRowIndex']
            if header_row_index != -1:
                cellset = cellset.where(
                    lambda cell: cell.row_idx != header_row_index)

        # iterate over all cells
        for cell in cellset:
            # query the backend for mentions of the cell's content
            query_res = self.backend.query(cell.content)

            # optional fuzzy fallback when the exact query found nothing
            if self.fuzzy[0] and len(query_res) == 0:
                query_res = self.backend.fuzzy_search(
                    cell.content, fuzzy_cutoff=self.fuzzy[1])

            # aggregate frequencies per (long-form) URI
            query_res_unique = Counter()
            for uri, freq in query_res:
                query_res_unique[uri.long()] += freq

            # get top <n> results (weighted by frequency of occurrence)
            top_n_res = sorted(query_res_unique.items(),
                               key=itemgetter(1),
                               reverse=True)[:self.top_n]

            # sum all frequencies to normalize the individual frequencies
            frequency_sum = sum(map(itemgetter(1), query_res))

            # ROBUSTNESS: an all-zero frequency result would otherwise
            # divide by zero below
            if not frequency_sum:
                continue

            # add annotations for each identified entity
            for uri, frequency in top_n_res:
                cell.annotations.append({
                    'source': 'preprocessing',
                    'task': 'EntityLinking',
                    'type': 'resource',
                    'resource_uri': uri,
                    'frequency': frequency / frequency_sum,
                })

        return True
Beispiel #7
0
    gold_data_name,
    el_index_in_name, ll_index_in_name,
    el_index_out_name, ll_index_out_name
) = sys.argv[1:]

# read gold data, collect entities/properties
print('* reading gold annotated data from "{:s}"...'.format(gold_data_name))

gold_entities = set()
gold_properties = set()

with io.open(gold_data_name, 'r', encoding='utf-8', errors='ignore') as gold_data_fh:
    for table_json in gold_data_fh:
        try:
            table_data = json.loads(table_json)
            table = Table(table_data)

            # get EntityLinking annotations from cells
            for cell in table.cells():
                el_annotations = cell.find_annotations(
                    anno_source = 'gold-v2',
                    anno_task = 'EntityLinking'
                )
                for el_anno in el_annotations:
                    resource_uri = URI.parse(el_anno['resource_uri'], 'dbr')
                    gold_entities.add(resource_uri.short())

            # get PropertyLinking annotations from columns
            for column in table.columns():
                pl_annotations = column.find_annotations(
                    anno_source = 'gold-v2',
Beispiel #8
0
    tableNo = 0

    # iterate over input. Each line represents one table
    for json_line in stdin:

        # skip irrelevant tables
        if relevant_tables and str(tableNo) not in relevant_tables:
            print('skipping table #{:d}'.format(tableNo))
            tableNo += 1
            continue

        # parse the table from the json
        table_data = json.loads(json_line)
        # create Table object to work with
        table = Table(table_data)

        # create hgp for each row
        for row in table.rows():

            # skip irrelevant rows
            if relevant_tables:
                relevant_rows = relevant_tables[str(tableNo)]
                if row.row_idx not in relevant_rows:
                    print('skipping row #{:d} in table #{:d}'.format(
                        row.row_idx, tableNo))
                    continue

            # initialize hypothethis graph pattern (hgp) as empty list
            hgp = []
            # Entity-Dictionnairy that indicates which entity has which blank-node. key:entity-uri, value: blankNode
Beispiel #9
0
total_missed = 0
missed_no_anno = 0

missed_file = 'eva_prep.missed'
stats_file = 'eva_prep.stats'

# read from stdin, ignore encoding errors
with io.open(sys.stdin.fileno(), 'r', encoding='utf-8', errors='ignore') as stdin, \
io.open(missed_file, 'w') as missed_fh, io.open(stats_file, 'w') as stats_fh:
    # iterate over input. Each line represents one table
    for json_line in stdin:
        try:
            # parse json
            table_data = json.loads(json_line)
            # create Table object to work with
            table = Table(table_data)

            for cell in table.cells():
                gold_el = cell.find_annotations(anno_source=gold_source,
                                                anno_task='EntityLinking')
                if gold_el:
                    total_entities += 1
                    gold_el = gold_el[0]
                    gold_uri = gold_el['resource_uri']

                    preprocessing_uris = [
                        el_anno['resource_uri'] for el_anno in
                        cell.find_annotations(anno_source=preprocessing_source,
                                              anno_task='EntityLinking')
                    ]
Beispiel #10
0
    classes_reader = csv.reader(classes_fh, delimiter=',', quotechar='"')
    for class_row in classes_reader:
        table_name_ext, class_name, class_uri = class_row
        table_name = table_name_ext.split('.')[0]
        classes[table_name] = (class_name, class_uri)

# iterate over all tables
for table_name_ext in os.listdir(tables_dir):
    table_file = os.path.join(tables_dir, table_name_ext)
    table_name = os.path.splitext(table_name_ext)[0]

    # read table data & create Table object
    with io.open(table_file, 'r', encoding='utf-8',
                 errors='ignore') as table_fh:
        table_data = json.load(table_fh)
        table = Table(table_data)

    # add class annotation if available
    if table_name in classes:
        class_name, class_uri = classes[table_name]
        table.annotations.append({
            'source': 'gold-v2',
            'task': 'ClassLinking',
            'type': 'class',
            'class_name': class_name,
            'class_uri': class_uri
        })

    # add property annotations (columns)
    key_col_idx = None
    property_file = os.path.join(property_dir, table_name + '.csv')
Beispiel #11
0
# read from stdin, ignore encoding errors
with io.open(sys.stdin.fileno(), 'r', encoding='utf-8', errors='ignore') as stdin:

    # read the URIs and save in list. only if the gold URI is a URI from the list, then compare to our found annotation/URI.
    with open('properties_to_consider.txt') as f:
        uris_from_file = f.readlines()
    uris_from_file = [uri.strip('\n') for uri in uris_from_file]

    # iterate over input. Each line represents one table
    for json_line in stdin:
        try:
            # parse json
            table_data = json.loads(json_line)
            # create Table object to work with
            table = Table(table_data)

            table_amount_columns = 0
            column_same_uri = 0
            column_other_uri_NOT_IN_list = 0
            column_other_uri_IN_list = 0
            column_has_no_gold_uri = 0
            column_has_no_LL_anno = 0
            column_gold_uri_not_valid = 0

            print('-------------------------------------------------------------------------\n')
            print('TABLE BEGIN - Table '+ str(table_no) + '   total rows: '+str(table.num_rows)+ '   total cols: '+str(table.num_cols) +'\n')

            for column in table.columns():

                print('\n COLUMN BEGIN - Col '+ str(column.col_idx))