Example No. 1
0
def main():
    """Load the edges table and its satellite tables into Postgres,
    committing once at the end and rebuilding indices afterwards."""
    args = parse_args()
    # Column list for each table; see the note on '?' below
    satellites = {
        'edges': 'id, node1, label, node2, data_type',
        'quantities':
        'edge_id, number, unit, low_tolerance?, high_tolerance?',  # Question mark signifies nullable numeric fields
        'strings': 'edge_id, text, language',
        'dates': 'edge_id, date_and_time, calendar, precision',
        'coordinates': 'edge_id, longitude, latitude, precision',
        'symbols': 'edge_id, symbol',
    }
    config_object = {'POSTGRES': POSTGRES}
    with postgres_connection(config_object) as conn:
        with conn.cursor() as cursor:
            disable_indices(cursor)
            # The edges table is imported first; satellites reference it
            import_table(cursor, args.input_dir, 'edges', satellites['edges'])
            for table, fields in satellites.items():
                if table != 'edges':
                    import_table(cursor, args.input_dir, table, fields)
            print('Committing changes')
            conn.commit()

            # Rebuild indices after committing, since in some cases Postgres complains that an object is already in use
            rebuild_indices(cursor)

    print('Done')
Example No. 2
0
def delete_dataset_metadata(dataset_qnode, debug=False):
    """Delete every edge whose node1 is the given dataset qnode.

    dataset_qnode -- qnode identifying the dataset whose edges are removed
    debug -- when True, print the query (with the bound value) before running it
    """
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            # Parameterized query: the previous f-string interpolation was
            # vulnerable to SQL injection through dataset_qnode
            query = "DELETE FROM edges WHERE node1=%s"
            params = (dataset_qnode,)
            if debug:
                # mogrify renders the query with the value bound, like the
                # old interpolated string did
                print(cursor.mogrify(query, params))
            cursor.execute(query, params)
Example No. 3
0
def delete_variable(dataset_id, variable_id, property_id, debug=False):
    """Delete a variable's satellite edges and then its main edges.

    Matching edges are those labeled property_id whose dataset edge
    (label 'P2006020004') points at dataset_id.

    NOTE(review): variable_id is accepted but never used; it is kept so the
    signature stays compatible with existing callers.
    """
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            # Everything here is running under the same transaction.
            # Parameterized queries: the previous f-string interpolation was
            # vulnerable to SQL injection through property_id / dataset_id.
            params = (property_id, dataset_id)

            # Delete satellite edges attached to the matching main edges
            query = """
            DELETE FROM edges WHERE node1 IN (
                    SELECT e_main.id
                        FROM edges AS e_main
                        JOIN edges AS e_dataset ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
                    WHERE e_main.label=%s AND e_dataset.node2=%s
            );"""
            if debug:
                print(cursor.mogrify(query, params))
            cursor.execute(query, params)

            # Now delete the main edges
            query = """
            DELETE FROM edges e_main WHERE id IN (
                    SELECT e_main.id
                        FROM edges AS e_main
                        JOIN edges AS e_dataset ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
                    WHERE e_main.label=%s AND e_dataset.node2=%s
            );
            """
            if debug:
                print(cursor.mogrify(query, params))
            cursor.execute(query, params)
Example No. 4
0
def delete_variable_metadata(dataset_id, variable_qnodes, debug=False):
    """Delete every edge whose node1 is one of the given variable qnodes.

    NOTE(review): dataset_id is accepted but never used; it is kept so the
    signature stays compatible with existing callers.
    """
    if not variable_qnodes:
        # "IN ()" is invalid SQL, and there is nothing to delete anyway.
        # The previous string-built version would have produced a broken query.
        return
    with postgres_connection() as conn:
        with conn.cursor() as cursor:
            # psycopg2 adapts a Python tuple to a SQL value list, so this is
            # both injection-safe and equivalent to the old joined string
            query = "DELETE FROM edges WHERE node1 IN %s"
            params = (tuple(variable_qnodes),)
            if debug:
                print(cursor.mogrify(query, params))
            cursor.execute(query, params)
Example No. 5
0
def read_existing_edges(args):
    """Return the set of all edge IDs currently present in the edges table."""
    existing = set()

    config_object = {'POSTGRES': POSTGRES}
    with postgres_connection(config_object) as conn:
        print(f'Reading edge IDs from the database')
        # Naming the cursor makes psycopg2 use a server-side cursor, so the
        # result set is streamed instead of materialized in memory at once
        with conn.cursor('edges') as cursor:
            cursor.itersize = 100000
            cursor.execute("SELECT id FROM edges")
            for (edge_id,) in cursor:
                existing.add(edge_id)
                # Progress report every million IDs collected
                if len(existing) % 1000000 == 0:
                    print(f'...{len(existing):,}')
    print(f'Retrieved {len(existing)} ids')

    return existing
Example No. 6
0
def run():
    """Create the fuzzy-search view for each admin type, optionally
    dropping and recreating views that already exist (--recreate)."""
    print('Creating the fuzzy search views')

    args = parse_args()

    config = {'POSTGRES': POSTGRES}
    with postgres_connection(config) as conn:
        for admin, admin_pnode in ADMIN_TYPES.items():
            exists = does_view_exists(conn, admin)
            if exists and not args.recreate:
                print(f'View for {admin} already exists, skipping')
                continue
            if exists:
                # --recreate: drop the stale view before rebuilding it
                drop_view(conn, admin, debug=True)
            create_view(conn, admin, admin_pnode, debug=True)

    print('Done')
Example No. 7
0
def refresh_all_views(config=None, debug=False):
    """Refresh the view of every admin type over a single connection."""
    with postgres_connection(config) as conn:
        for admin_type in ADMIN_TYPES:
            refresh_view(conn, admin_type, debug=debug)
Example No. 8
0
def import_kgtk_tsv(filename: str, config=None):
    """Import a KGTK TSV file into the edges table and its satellite tables.

    Rows are read, grouped by value type, then written in one transaction
    using chunked multi-row INSERT statements with ON CONFLICT DO NOTHING.

    filename -- path to the tab-separated KGTK file
    config -- optional connection config; a bare dict without a 'POSTGRES'
              key is wrapped as {'POSTGRES': config} for convenience
    """
    def column_names(fields):
        # Strip the '$' (string) and '?' (nullable) markers off field specs
        for field in fields:
            if field[-2:] == '$?':
                yield field[:-2]
            elif field[-1] in ('$', '?'):
                yield field[:-1]
            else:
                yield field

    def object_values(obj, fields, column_names):
        # Render one object as a "(v1, v2, ...)" VALUES tuple
        def format_value(obj, field, column):
            val = getattr(obj, column, None)
            if val is None:
                if '?' not in field:
                    # Fixed message: was "as a null value"
                    raise ValueError(
                        f"Non nullable field {column} has a null value")
                return 'NULL'
            # Escape single quotes by doubling them (standard SQL escaping).
            # NOTE(review): assumes standard_conforming_strings is on so
            # backslashes need no escaping — confirm against the server config.
            val = str(val).replace("'", "''")
            if '$' in field:
                return f"'{val}'"
            return val

        values = [
            format_value(obj, field, column)
            for field, column in zip(fields, column_names)
        ]
        return "(" + ", ".join(values) + ")"

    def write_objects(typename, objects):
        # Map from object type name to ('table-name', list of fields)
        # A $ signifies a string value. A ? signifies a nullable value
        OBJECT_INFO = {
            'Edge':
            ('edges', ['id$', 'node1$', 'label$', 'node2$', 'data_type$']),
            'StringValue': ('strings', ['edge_id$', 'text$', 'language$?']),
            'DateValue':
            ('dates',
             ['edge_id$', 'date_and_time$', 'precision$?', 'calendar$?']),
            'QuantityValue': ('quantities', [
                'edge_id$', 'number', 'unit$?', 'low_tolerance?',
                'high_tolerance?'
            ]),
            'CoordinateValue':
            ('coordinates',
             ['edge_id$', 'latitude', 'longitude', 'precision$?']),
            'SymbolValue': ('symbols', ['edge_id$', 'symbol$']),
        }

        table_name, fields = OBJECT_INFO[typename]
        columns = list(column_names(fields))

        # Batch the inserts; a single statement per CHUNK_SIZE rows.
        # Uses the `cursor` bound in the connection block below (closure).
        CHUNK_SIZE = 10000
        for x in range(0, len(objects), CHUNK_SIZE):
            statement = f"INSERT INTO {table_name} ( {', '.join(columns)} ) VALUES\n"
            chunk = objects[x:x + CHUNK_SIZE]  # renamed: 'slice' shadowed a builtin
            values = [object_values(obj, fields, columns) for obj in chunk]
            statement += ',\n'.join(values)
            statement += "\nON CONFLICT DO NOTHING;"
            cursor.execute(statement)

    def save_objects(type_name: str, objects: List[Tuple]):
        # Each entry is (edge, value): write the edges first, then the values
        edges = [t[0] for t in objects]
        write_objects('Edge', edges)
        values = [t[1] for t in objects]
        write_objects(type_name, values)

    obj_map: Dict[
        str,
        List[Tuple]] = dict()  # Map from value type to list of (edge, value)
    start = time.time()
    print("Reading rows")
    with open(filename, "r", encoding="utf-8") as f:
        reader = DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            unquote_dict(row)
            edge, value = create_edge_objects(row)
            value_type = type(value).__name__
            obj_map.setdefault(value_type, []).append((edge, value))

    count = sum(len(objects) for objects in obj_map.values())
    for type_name, objects in obj_map.items():
        print(f"{type_name}\t{len(objects)}")
    print(f"Read {count} objects in {time.time() - start}")

    if count == 0:
        return

    # Time to write the edges
    if config and 'POSTGRES' not in config:
        config = dict(POSTGRES=config)

    with postgres_connection(config) as conn:
        with conn.cursor() as cursor:
            # Everything here runs under one transaction
            for type_name, objects in obj_map.items():
                save_objects(type_name, objects)
                print(
                    f"Saved {len(objects)} of {type_name} - {time.time() - start}"
                )
        conn.commit()

    print(f"Done saving {count} objects in {time.time() - start}")

    return