def main():
    global num_tabel
    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                 errors='ignore') as stdin:
        for json_line in stdin:
            try:
                table_data = json.loads(json_line)
                table = Table(table_data)
                t = []
                col = [
                    c.find_annotations(anno_task='PropertyLinking')
                    for c in table.columns()
                ]
                for row in table.rows():
                    # check for a non-empty result ('is not []' is always True)
                    if row.find_annotations(anno_task='EntityLinking'):
                        i = 0
                        r = []
                        for cell in row:
                            if col[i]:
                                setProperties(cell, col, i)
                            i += 1
                num_tabel += 1
            # ignore json decoding errors
            except JSONDecodeError:
                pass

    # plot properties
    plt_bar(hist_props, used_props, 'Properties', 'Occurrences',
            'hists/01_props_ov.html', 3000, '#Missing PropertyLinks',
            'Property')
def main():
    global num_tabel
    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                 errors='ignore') as stdin:
        for json_line in stdin:
            try:
                table_data = json.loads(json_line)
                table = Table(table_data)
                t = []
                col = [
                    c.find_annotations(anno_task='PropertyLinking')
                    for c in table.columns()
                ]
                for row in table.rows():
                    # check for a non-empty result ('is not []' is always True)
                    if row.find_annotations(anno_task='EntityLinking'):
                        i = 0
                        r = []
                        for cell in row:
                            if col[i]:
                                r = r + [[getEntity(cell)] +
                                         [getLiteral(cell, col, i)]]
                            i += 1
                        if r:
                            t = t + [r]
                if t:
                    analyzeTable(t)
                num_tabel += 1
            # ignore json decoding errors
            except JSONDecodeError:
                pass

    # create histogram for rows
    plt_bar(hist_row, lbl_col, 'Overall number of rows',
            'Number of empty cells in a row', 'hists/01_ov_rows.html', 4000,
            '#Missing Annotations in row', '#How often this amount is missing')
    # create histogram for columns
    plt_bar(hist_col, lbl_row, 'Overall number of cols',
            'Number of empty cells in a col', 'hists/01_ov_cols.html', 4000,
            '#Missing Annotations in column', '#How often this amount is missing')
def run(self, table: Table):
    cellset = table.cells()
    if 'headerRowIndex' in table.table_data:
        header_row_index = table.table_data['headerRowIndex']
        if header_row_index != -1:
            cellset = cellset.where(
                lambda cell: cell.row_idx != header_row_index)

    # iterate over all cells
    for cell in cellset:
        if not cell.content:
            continue

        # always add a 'plain' annotation
        cell.annotations.append({
            'source': 'preprocessing',
            'task': 'LiteralNormalization',
            'type': 'plain',
            'string': cell.content,
        })

        # identify values with units
        unit_hypos = self.unit_parser.parse(cell.content)
        if unit_hypos:
            for unit_hypo in unit_hypos:
                cell.annotations.append({
                    'source': 'preprocessing',
                    'task': 'LiteralNormalization',
                    'type': 'value and unit',
                    **unit_hypo
                })
            continue

        # identify dates
        date_hypos = self.date_parser.parse(cell.content)
        if date_hypos:
            for date_hypo in date_hypos:
                cell.annotations.append({
                    'source': 'preprocessing',
                    'task': 'LiteralNormalization',
                    'type': 'date',
                    **date_hypo
                })
            continue

        # identify numbers
        numbers = self.numeric_parser.parse(cell.content)
        if numbers:
            for number in numbers:
                cell.annotations.append({
                    'source': 'preprocessing',
                    'task': 'LiteralNormalization',
                    'type': 'numeric',
                    'number': number,
                })
            continue

    return True
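# Hedged sketch (not part of the original task code): it shows how the
# LiteralNormalization annotations written above could be read back
# downstream, using the same find_annotations() filter style as the other
# scripts in this section. The 'table' variable is an assumption here (any
# Table parsed from a JSON line).
for cell in table.cells():
    ln_annotations = cell.find_annotations(anno_source='preprocessing',
                                           anno_task='LiteralNormalization')
    for anno in ln_annotations:
        # every non-empty cell carries at least a 'plain' annotation; unit,
        # date and numeric hypotheses are added on top of it
        print(cell.content, '->', anno['type'])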
def process_line(json_line):
    try:
        table_data = json.loads(json_line)
        # create Table object from 'relation' field
        if 'relation' in table_data and len(table_data['relation']) > 0:
            # create table object from table_data
            table = Table(table_data)
            # run scheduled tasks
            for task in tasks_scheduled:
                if not task.run(table):
                    return None
            else:
                # output annotated table as json
                return json.dumps(table.dump())
        else:
            return None
    # ignore JSON decoding errors
    except JSONDecodeError:
        return None
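# Hedged usage sketch (an assumption, not part of the original pipeline code):
# it shows how process_line() could be driven line by line over stdin,
# mirroring the io.open(sys.stdin.fileno(), ...) pattern used by the other
# scripts in this section. Populating tasks_scheduled with the preprocessing
# run() implementations shown here (LiteralNormalization, EntityLinking) is
# assumed to happen elsewhere.
if __name__ == '__main__':
    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                 errors='ignore') as stdin:
        for json_line in stdin:
            annotated_json = process_line(json_line)
            # only emit tables that were parsed and annotated successfully
            if annotated_json is not None:
                print(annotated_json)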
def main():
    global num_tabel
    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8',
                 errors='ignore') as stdin:
        for json_line in stdin:
            try:
                table_data = json.loads(json_line)
                table = Table(table_data)
                t = []
                col = [
                    c.find_annotations(anno_task='PropertyLinking')
                    for c in table.columns()
                ]
                for row in table.rows():
                    # check for a non-empty result ('is not []' is always True)
                    if row.find_annotations(anno_source='preprocessing',
                                            anno_task='EntityLinking'):
                        i = 0
                        r = []
                        for cell in row:
                            if col[i]:
                                r = r + [[getEntity_ng(cell)] +
                                         [getLiteral_ng(cell)]]
                            i += 1
                        if r:
                            t = t + [r]
                if t:
                    analyzeTable(t, col)
                num_tabel += 1
            except JSONDecodeError:
                pass

    print("Found Tables: ", found_tables, ", Number: ", num_found, "\n")
    # write the collected tables to a file (do not shadow the 'stdin' name)
    with io.open('intresting_tables.json', 'w', encoding='utf-8',
                 errors='ignore') as out_fh:
        json.dump(found_tables_with_rows, out_fh)
def run(self, table: Table) -> bool:
    cellset = table.cells()
    if 'headerRowIndex' in table.table_data:
        header_row_index = table.table_data['headerRowIndex']
        if header_row_index != -1:
            cellset = cellset.where(
                lambda cell: cell.row_idx != header_row_index)

    # iterate over all cells
    for cell in cellset:
        # query the backend for mentions of the cell's content
        query_res = self.backend.query(cell.content)
        if self.fuzzy[0] and len(query_res) == 0:
            query_res = self.backend.fuzzy_search(
                cell.content, fuzzy_cutoff=self.fuzzy[1])

        query_res_unique = Counter()
        for uri, freq in query_res:
            query_res_unique[uri.long()] += freq

        # get top <n> results (weighted by frequency of occurrence)
        top_n_res = sorted(query_res_unique.items(),
                           key=itemgetter(1), reverse=True)[:self.top_n]

        # sum all frequencies to normalize the individual frequencies
        frequency_sum = sum(map(itemgetter(1), query_res))

        # add annotations for each identified entity
        for entity in top_n_res:
            uri, frequency = entity
            normalized_frequency = frequency / frequency_sum
            cell.annotations.append({
                'source': 'preprocessing',
                'task': 'EntityLinking',
                'type': 'resource',
                'resource_uri': uri,
                'frequency': normalized_frequency,
            })

    return True
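# Hedged, self-contained sketch of the ranking step above: given a toy list of
# (uri, frequency) query results, keep the top-n URIs by aggregated frequency
# and normalize each against the total frequency mass. The toy data and the
# plain string URIs are assumptions for illustration only (the real code uses
# URI objects returned by the index backend).
from collections import Counter
from operator import itemgetter


def top_n_normalized(query_res, top_n=3):
    # aggregate duplicate URIs
    counts = Counter()
    for uri, freq in query_res:
        counts[uri] += freq
    # rank by aggregated frequency and keep the n most frequent
    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)[:top_n]
    # normalize by the total frequency over all results (not only the top n)
    total = sum(map(itemgetter(1), query_res))
    return [(uri, freq / total) for uri, freq in ranked]


# example: 'dbr:Berlin' dominates the mentions, so it receives the largest
# normalized weight
# top_n_normalized([('dbr:Berlin', 8), ('dbr:Berlin,_Wisconsin', 2),
#                   ('dbr:Berlin_(band)', 1)], top_n=2)
# -> [('dbr:Berlin', 0.727...), ('dbr:Berlin,_Wisconsin', 0.181...)]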
(
    gold_data_name,
    el_index_in_name,
    ll_index_in_name,
    el_index_out_name,
    ll_index_out_name
) = sys.argv[1:]

# read gold data, collect entities/properties
print('* reading gold annotated data from "{:s}"...'.format(gold_data_name))
gold_entities = set()
gold_properties = set()
with io.open(gold_data_name, 'r', encoding='utf-8',
             errors='ignore') as gold_data_fh:
    for table_json in gold_data_fh:
        try:
            table_data = json.loads(table_json)
            table = Table(table_data)

            # get EntityLinking annotations from cells
            for cell in table.cells():
                el_annotations = cell.find_annotations(
                    anno_source='gold-v2',
                    anno_task='EntityLinking'
                )
                for el_anno in el_annotations:
                    resource_uri = URI.parse(el_anno['resource_uri'], 'dbr')
                    gold_entities.add(resource_uri.short())

            # get PropertyLinking annotations from columns
            for column in table.columns():
                pl_annotations = column.find_annotations(
                    anno_source='gold-v2',
tableNo = 0
# iterate over input. Each line represents one table
for json_line in stdin:
    # skip irrelevant tables
    if relevant_tables and str(tableNo) not in relevant_tables:
        print('skipping table #{:d}'.format(tableNo))
        tableNo += 1
        continue

    # parse the table from the json
    table_data = json.loads(json_line)
    # create Table object to work with
    table = Table(table_data)

    # create hgp for each row
    for row in table.rows():
        # skip irrelevant rows
        if relevant_tables:
            relevant_rows = relevant_tables[str(tableNo)]
            if row.row_idx not in relevant_rows:
                print('skipping row #{:d} in table #{:d}'.format(
                    row.row_idx, tableNo))
                continue

        # initialize hypothesis graph pattern (hgp) as empty list
        hgp = []
        # entity dictionary mapping each entity to its blank node;
        # key: entity URI, value: blank node
total_missed = 0
missed_no_anno = 0

missed_file = 'eva_prep.missed'
stats_file = 'eva_prep.stats'

# read from stdin, ignore encoding errors
with io.open(sys.stdin.fileno(), 'r', encoding='utf-8', errors='ignore') as stdin, \
        io.open(missed_file, 'w') as missed_fh, \
        io.open(stats_file, 'w') as stats_fh:
    # iterate over input. Each line represents one table
    for json_line in stdin:
        try:
            # parse json
            table_data = json.loads(json_line)
            # create Table object to work with
            table = Table(table_data)

            for cell in table.cells():
                gold_el = cell.find_annotations(anno_source=gold_source,
                                                anno_task='EntityLinking')
                if gold_el:
                    total_entities += 1
                    gold_el = gold_el[0]
                    gold_uri = gold_el['resource_uri']

                    preprocessing_uris = [
                        el_anno['resource_uri']
                        for el_anno in cell.find_annotations(
                            anno_source=preprocessing_source,
                            anno_task='EntityLinking')
                    ]
classes_reader = csv.reader(classes_fh, delimiter=',', quotechar='"')
for class_row in classes_reader:
    table_name_ext, class_name, class_uri = class_row
    table_name = table_name_ext.split('.')[0]
    classes[table_name] = (class_name, class_uri)

# iterate over all tables
for table_name_ext in os.listdir(tables_dir):
    table_file = os.path.join(tables_dir, table_name_ext)
    table_name = os.path.splitext(table_name_ext)[0]

    # read table data & create Table object
    with io.open(table_file, 'r', encoding='utf-8',
                 errors='ignore') as table_fh:
        table_data = json.load(table_fh)
    table = Table(table_data)

    # add class annotation if available
    if table_name in classes:
        class_name, class_uri = classes[table_name]
        table.annotations.append({
            'source': 'gold-v2',
            'task': 'ClassLinking',
            'type': 'class',
            'class_name': class_name,
            'class_uri': class_uri
        })

    # add property annotations (columns)
    key_col_idx = None
    property_file = os.path.join(property_dir, table_name + '.csv')
# read from stdin, ignore encoding errors
with io.open(sys.stdin.fileno(), 'r', encoding='utf-8', errors='ignore') as stdin:
    # read the URIs into a list; a gold URI is only compared against our
    # found annotation/URI if it appears in this list
    with open('properties_to_consider.txt') as f:
        uris_from_file = f.readlines()
    uris_from_file = [uri.strip('\n') for uri in uris_from_file]

    # iterate over input. Each line represents one table
    for json_line in stdin:
        try:
            # parse json
            table_data = json.loads(json_line)
            # create Table object to work with
            table = Table(table_data)

            table_amount_columns = 0
            column_same_uri = 0
            column_other_uri_NOT_IN_list = 0
            column_other_uri_IN_list = 0
            column_has_no_gold_uri = 0
            column_has_no_LL_anno = 0
            column_gold_uri_not_valid = 0

            print('-------------------------------------------------------------------------\n')
            print('TABLE BEGIN - Table ' + str(table_no) + ' total rows: ' +
                  str(table.num_rows) + ' total cols: ' + str(table.num_cols) + '\n')

            for column in table.columns():
                print('\n COLUMN BEGIN - Col ' + str(column.col_idx))