def main():
    args = parse_args()
    # Column lists for the edges table and its satellite tables.
    # A question mark signifies a nullable numeric field.
    satellites = dict(
        edges='id, node1, label, node2, data_type',
        quantities='edge_id, number, unit, low_tolerance?, high_tolerance?',
        strings='edge_id, text, language',
        dates='edge_id, date_and_time, calendar, precision',
        coordinates='edge_id, longitude, latitude, precision',
        symbols='edge_id, symbol',
    )

    config_object = {'POSTGRES': POSTGRES}
    with postgres_connection(config_object) as conn:
        with conn.cursor() as cursor:
            disable_indices(cursor)
            # Import the main edges table first, then all satellite tables
            import_table(cursor, args.input_dir, 'edges', satellites['edges'])
            for satellite, fields in satellites.items():
                if satellite == 'edges':
                    continue
                import_table(cursor, args.input_dir, satellite, fields)
            print('Committing changes')
            conn.commit()
            # Rebuild indices after committing, since in some cases Postgres
            # complains that an object is already in use
            rebuild_indices(cursor)
    print('Done')
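
# NOTE: import_table, disable_indices and rebuild_indices are defined
# elsewhere in this repo. As a rough illustration of the bulk-load step only,
# here is a minimal sketch of what such an import helper could look like,
# assuming psycopg2 (the named server-side cursor in read_existing_edges
# suggests it) and one <table>.tsv file per table in input_dir. The helper
# name and the file layout are assumptions, not the repo's implementation.
import os

def import_table_sketch(cursor, input_dir, table, fields):
    # Strip the '?' nullability markers to get plain column names
    columns = ', '.join(f.strip().rstrip('?') for f in fields.split(','))
    path = os.path.join(input_dir, f'{table}.tsv')  # assumed file layout
    with open(path, 'r', encoding='utf-8') as f:
        # COPY FROM STDIN is far faster than row-by-row INSERTs for bulk loads
        cursor.copy_expert(
            f"COPY {table} ({columns}) FROM STDIN WITH (FORMAT text, NULL '')",
            f)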
def delete_dataset_metadata(dataset_qnode, debug=False):
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            query = f"DELETE FROM edges WHERE node1='{dataset_qnode}'"
            if debug:
                print(query)
            cursor.execute(query)
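
# The delete helpers above and below interpolate identifiers directly into
# SQL strings, so they must only ever be called with trusted values. A hedged
# alternative, assuming psycopg2: let the driver handle quoting via parameter
# binding. The function name is hypothetical; this is a sketch of the same
# behavior, not the repo's API.
def delete_dataset_metadata_parameterized(dataset_qnode, debug=False):
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            if debug:
                # mogrify renders the query with the parameters bound
                print(cursor.mogrify(
                    "DELETE FROM edges WHERE node1=%s",
                    (dataset_qnode,)).decode())
            cursor.execute(
                "DELETE FROM edges WHERE node1=%s", (dataset_qnode,))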
def delete_variable(dataset_id, variable_id, property_id, debug=False):
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            # Everything here is running under the same transaction

            # Delete properties
            query = f"""
            DELETE FROM edges WHERE node1 IN (
                SELECT e_main.id FROM edges AS e_main
                JOIN edges AS e_dataset
                    ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
                WHERE e_main.label='{property_id}' AND e_dataset.node2='{dataset_id}'
            );"""
            if debug:
                print(query)
            cursor.execute(query)

            # Now delete the main edges
            query = f"""
            DELETE FROM edges e_main WHERE id IN (
                SELECT e_main.id FROM edges AS e_main
                JOIN edges AS e_dataset
                    ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
                WHERE e_main.label='{property_id}' AND e_dataset.node2='{dataset_id}'
            );
            """
            if debug:
                print(query)
            cursor.execute(query)
def delete_variable_metadata(dataset_id, variable_qnodes, debug=False):
    variable_qnodes_str = ', '.join(
        [f"'{qnode}'" for qnode in variable_qnodes])
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            query = f"""DELETE FROM edges WHERE node1 IN ({variable_qnodes_str})"""
            if debug:
                print(query)
            cursor.execute(query)
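
# A hedged variant of the manually quoted IN (...) list above, assuming
# psycopg2, which adapts a Python list to a Postgres array so that
# node1 = ANY(%s) can take the qnode list as a bound parameter. The function
# name is hypothetical; dataset_id mirrors the original signature (it is
# likewise unused).
def delete_variable_metadata_any(dataset_id, variable_qnodes, debug=False):
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(
                "DELETE FROM edges WHERE node1 = ANY(%s)",
                (list(variable_qnodes),))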
def read_existing_edges(args):
    existing = set()
    config_object = {'POSTGRES': POSTGRES}
    with postgres_connection(config_object) as conn:
        print('Reading edge IDs from the database')
        with conn.cursor('edges') as cursor:  # Server-side cursor
            cursor.itersize = 100000
            query = "SELECT id FROM edges"
            cursor.execute(query)
            for record in cursor:
                existing.add(record[0])
                if len(existing) % 1000000 == 0:
                    print(f'...{len(existing):,}')
    print(f'Retrieved {len(existing)} ids')
    return existing
def run():
    print('Creating the fuzzy search views')
    args = parse_args()
    config = dict(POSTGRES=POSTGRES)
    with postgres_connection(config) as conn:
        for admin, admin_pnode in ADMIN_TYPES.items():
            if does_view_exists(conn, admin):
                if args.recreate:
                    drop_view(conn, admin, debug=True)
                else:
                    print(f'View for {admin} already exists, skipping')
                    continue
            create_view(conn, admin, admin_pnode, debug=True)
    print('Done')
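
# does_view_exists, drop_view, create_view and refresh_view are defined
# elsewhere in this repo. Since the views need refreshing (see
# refresh_all_views below), they are presumably materialized views; a minimal
# existence check could query the pg_matviews catalog. The 'fuzzy_' naming
# scheme and the function name are assumptions for illustration only.
def does_view_exists_sketch(conn, admin):
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT 1 FROM pg_matviews WHERE matviewname = %s",
            (f'fuzzy_{admin}',))
        return cursor.fetchone() is not None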
def refresh_all_views(config=None, debug=False):
    with postgres_connection(config) as conn:
        for admin in ADMIN_TYPES.keys():
            refresh_view(conn, admin, debug=debug)
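
# A hedged sketch of what refresh_view might do under the same assumptions
# (materialized views named fuzzy_<admin>); the repo's implementation may
# differ.
def refresh_view_sketch(conn, admin, debug=False):
    query = f'REFRESH MATERIALIZED VIEW fuzzy_{admin}'
    if debug:
        print(query)
    with conn.cursor() as cursor:
        cursor.execute(query)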
# Standard-library imports used by import_kgtk_tsv (other helpers, such as
# create_edge_objects and unquote_dict, are defined elsewhere in this repo)
import csv
import time
from csv import DictReader
from typing import Dict, List, Tuple


def import_kgtk_tsv(filename: str, config=None):
    def column_names(fields):
        # Strip the '$' (string) and '?' (nullable) markers from field specs
        for field in fields:
            if field[-2:] == '$?':
                yield field[:-2]
            elif field[-1] in ('$', '?'):
                yield field[:-1]
            else:
                yield field

    def object_values(obj, fields, columns):
        def format_value(obj, field, column):
            val = getattr(obj, column, None)
            if val is None:
                if '?' not in field:
                    raise ValueError(
                        f"Non-nullable field {column} has a null value")
                return 'NULL'
            val = str(val).replace("'", "''")
            if '$' in field:
                return f"'{val}'"
            return val

        values = []
        for (idx, field) in enumerate(fields):
            column = columns[idx]
            values.append(format_value(obj, field, column))
        return "(" + ", ".join(values) + ")"

    def write_objects(typename, objects):
        # Map from object type name to ('table-name', list of fields).
        # A $ signifies a string value. A ? signifies a nullable value.
        OBJECT_INFO = {
            'Edge': ('edges',
                     ['id$', 'node1$', 'label$', 'node2$', 'data_type$']),
            'StringValue': ('strings', ['edge_id$', 'text$', 'language$?']),
            'DateValue': ('dates',
                          ['edge_id$', 'date_and_time$', 'precision$?',
                           'calendar$?']),
            'QuantityValue': ('quantities',
                              ['edge_id$', 'number', 'unit$?',
                               'low_tolerance?', 'high_tolerance?']),
            'CoordinateValue': ('coordinates',
                                ['edge_id$', 'latitude', 'longitude',
                                 'precision$?']),
            'SymbolValue': ('symbols', ['edge_id$', 'symbol$']),
        }
        table_name, fields = OBJECT_INFO[typename]
        columns = list(column_names(fields))

        # Insert in chunks to keep statements to a manageable size.
        # `cursor` is bound in the connection block further down.
        CHUNK_SIZE = 10000
        for x in range(0, len(objects), CHUNK_SIZE):
            statement = f"INSERT INTO {table_name} ( {', '.join(columns)} ) VALUES\n"
            chunk = objects[x:x + CHUNK_SIZE]
            values = [object_values(obj, fields, columns) for obj in chunk]
            statement += ',\n'.join(values)
            statement += "\nON CONFLICT DO NOTHING;"
            cursor.execute(statement)

    def save_objects(type_name: str, objects: List[Tuple]):
        # Each entry is an (edge, value) tuple - write the edges first,
        # then the type-specific satellite rows
        edges = [t[0] for t in objects]
        write_objects('Edge', edges)
        values = [t[1] for t in objects]
        write_objects(type_name, values)

    obj_map: Dict[
        str, List[Tuple]] = dict()  # Map from value type to list of (edge, value)

    start = time.time()
    print("Reading rows")
    with open(filename, "r", encoding="utf-8") as f:
        reader = DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            unquote_dict(row)
            edge, value = create_edge_objects(row)
            value_type = type(value).__name__
            if value_type not in obj_map:
                obj_map[value_type] = []
            obj_map[value_type].append((edge, value))

    count = 0
    for (type_name, objects) in obj_map.items():
        count += len(objects)
        print(f"{type_name}\t{len(objects)}")
    print(f"Read {count} objects in {time.time() - start}")
    if count == 0:
        return

    # Time to write the edges. A bare connection dict is also accepted and
    # wrapped into the expected {'POSTGRES': ...} shape.
    if config and 'POSTGRES' not in config:
        config = dict(POSTGRES=config)
    with postgres_connection(config) as conn:
        with conn.cursor() as cursor:
            # Everything here runs under one transaction
            for (type_name, objects) in obj_map.items():
                save_objects(type_name, objects)
                print(
                    f"Saved {len(objects)} of {type_name} - {time.time() - start}"
                )
            conn.commit()
    print(f"Done saving {count} objects in {time.time() - start}")
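
# Example invocation; the input file name is a placeholder. POSTGRES is the
# same connection-settings constant used by the other functions above.
if __name__ == '__main__':
    import_kgtk_tsv('exploded_edges.tsv', config={'POSTGRES': POSTGRES})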