Example #1
def init_postgres(config, graph_name, index):
    """
        Initialize the RDF graph GRAPH_NAME with a PostgreSQL/PostgreSQL-MVCC backend, described in the configuration file CONFIG.
    """
    # install logger
    coloredlogs.install(level='INFO',
                        fmt='%(asctime)s - %(levelname)s %(message)s')
    logger = logging.getLogger(__name__)

    # load graph from config file
    graph, kind = load_graph(config,
                             graph_name,
                             logger,
                             backends=['postgres', 'postgres-mvcc'])
    enable_mvcc = kind == 'postgres-mvcc'

    # init PostgreSQL connection
    connection = connect_postgres(graph)
    if connection is None:
        exit(1)
    # turn off autocommit
    connection.autocommit = False

    # create all SQL queries used to init the graph, using the graph name
    table_name = graph['name']
    create_table_query = p_utils.get_postgres_create_table(
        table_name, enable_mvcc=enable_mvcc)
    create_indexes_queries = p_utils.get_postgres_create_indexes(
        table_name, enable_mvcc=enable_mvcc)

    cursor = connection.cursor()
    # create the main SQL table
    logger.info("Creating SQL table {}...".format(table_name))
    cursor.execute(create_table_query)
    logger.info("SPARQL table {} successfully created".format(table_name))

    # create the additional indexes on OSP and POS
    if index:
        logger.info("Creating additional B-tree indexes...")
        for q in create_indexes_queries:
            cursor.execute(q)
        logger.info("Additional B-tree indexes successfully created")
    else:
        logger.info("Skipping additional indexes creation on user-demand")

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    connection.commit()
    cursor.close()
    connection.close()
    logger.info(
        "Sage PostgreSQL model for table {} successfully initialized".format(
            table_name))
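
The helper connect_postgres is used by every PostgreSQL example in this listing but is not shown. A minimal sketch of what such a helper might look like with psycopg2, assuming the graph configuration dict exposes dbname, user and password keys (both the keys and the error handling are assumptions; the project's real helper may differ):

import logging
import psycopg2

def connect_postgres(graph):
    """Hypothetical sketch: open a psycopg2 connection from a graph config dict."""
    try:
        return psycopg2.connect(dbname=graph['dbname'],
                                user=graph['user'],
                                password=graph['password'])
    except psycopg2.OperationalError as error:
        # callers check for None and exit(1) on failure
        logging.getLogger(__name__).error(error)
        return None
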
Example #2
def init_postgres(config, graph_name, index):
    """Initialize the RDF graph GRAPH_NAME with a PostgreSQL backend, described in the configuration file CONFIG."""
    # load graph from config file
    graph, backend = load_graph(
        config,
        graph_name,
        logger,
        backends=['postgres', 'postgres-mvcc', 'postgres-catalog'])

    # init PostgreSQL connection
    logger.info("Connecting to the PostgreSQL server...")
    connection = connect_postgres(graph)
    if connection is None:
        logger.error('Failed to establish a connection with PostgreSQL')
        exit(1)
    logger.info("Connected to the PostgreSQL server")

    # turn off autocommit
    connection.autocommit = False

    # create a cursor to interact with the database
    cursor = connection.cursor()

    # create the main SQL tables
    logger.info("Creating PostgreSQL tables...")
    create_table_queries = psql_utils.get_create_tables_queries(
        graph_name, backend)
    for query in create_table_queries:
        cursor.execute(query)
    logger.info("PostgreSQL tables successfully created")

    # create the additional indexes on OSP and POS
    if index:
        logger.info("Creating additional B-tree indexes...")
        create_indexes_queries = psql_utils.get_create_indexes_queries(
            graph_name, backend)
        for query in create_indexes_queries:
            cursor.execute(query)
        logger.info("Additional B-tree indexes successfully created")
    else:
        logger.info("Skipping additional indexes creation on user-demand")

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    connection.commit()
    cursor.close()
    connection.close()
    logger.info(
        f"Sage PostgreSQL model for graph '{graph_name}' successfully initialized"
    )
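
psql_utils.get_create_tables_queries is a project helper whose exact output is not reproduced here. Purely to illustrate the kind of SQL such a helper could emit for a plain triple-table layout (the schema below is an assumption, not the project's actual one):

def example_create_tables_queries(graph_name):
    # Hypothetical stand-in for psql_utils.get_create_tables_queries:
    # a single triple table keyed on (subject, predicate, object), so
    # SPO-ordered scans are answered by the primary-key index.
    return [
        f"""CREATE TABLE {graph_name} (
                subject TEXT NOT NULL,
                predicate TEXT NOT NULL,
                object TEXT NOT NULL,
                PRIMARY KEY (subject, predicate, object))"""
    ]
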
Example #3
def init_sqlite(config, graph_name, index):
    """Initialize the RDF graph GRAPH_NAME with a SQlite backend, described in the configuration file CONFIG."""
    # load graph from config file
    graph, backend = load_graph(config,
                                graph_name,
                                logger,
                                backends=['sqlite', 'sqlite-catalog'])

    # init SQLite connection
    logger.info("Connecting to the SQLite database...")
    connection = connect_sqlite(graph)
    if connection is None:
        logger.error('Failed to establish a connection with SQLite')
        exit(1)
    # disable implicit transactions so the explicit BEGIN/COMMIT below take effect
    connection.isolation_level = None
    logger.info("Connected to the SQLite database")

    # create a cursor to interact with the database
    cursor = connection.cursor()

    # start a transaction
    cursor.execute("BEGIN TRANSACTION")

    # create the main SQL tables
    logger.info("Creating SQlite tables...")
    create_table_queries = sqlite_utils.get_create_tables_queries(
        graph_name, backend)
    for query in create_table_queries:
        cursor.execute(query)
    logger.info("SQlite tables successfully created")

    # create the additional indexes on OSP and POS
    if index:
        logger.info("Creating additional B-tree indexes...")
        create_indexes_queries = sqlite_utils.get_create_indexes_queries(
            graph_name, backend)
        for query in create_indexes_queries:
            cursor.execute(query)
        logger.info("Additional B-tree indexes successfully created")
    else:
        logger.info("Skipping additional indexes creation on user-demand")

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    cursor.execute("COMMIT")
    cursor.close()
    connection.close()
    logger.info(
        f"Sage SQlite model for graph '{graph_name}' successfully initialized")
Example #4
def index_postgres(config, graph_name):
    """Create the additional B-tree indexes on the RDF graph GRAPH_NAME, described in the configuration file CONFIG."""
    # load graph from config file
    graph, backend = load_graph(
        config,
        graph_name,
        logger,
        backends=['postgres', 'postgres-mvcc', 'postgres-catalog'])

    # init PostgreSQL connection
    logger.info("Connecting to the PostgreSQL server...")
    connection = connect_postgres(graph)
    if connection is None:
        logger.error('Failed to establish a connection with PostgreSQL')
        exit(1)
    logger.info("Connected to the PostgreSQL server")

    # turn off autocommit
    connection.autocommit = False

    # create a cursor to interact with the database
    cursor = connection.cursor()

    # create indexes
    start = time.time()
    logger.info("Creating additional B-tree indexes...")
    create_indexes_queries = psql_utils.get_create_indexes_queries(
        graph_name, backend)
    for query in create_indexes_queries:
        cursor.execute(query)
    stop = time.time()
    logger.info(
        f"Additional B-tree indexes successfully created in {stop - start}s")

    # rebuild table statistics
    logger.info("Rebuilding table statistics...")
    start = time.time()
    cursor.execute(psql_utils.get_analyze_query(graph_name))
    logger.info(
        f"Table statistics successfully rebuilt in {time.time() - start}s")

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    connection.commit()
    cursor.close()
    connection.close()
    logger.info(
        f"Sage PostgreSQL model for graph '{graph_name}' successfully initialized"
    )
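
The OSP and POS indexes mentioned in the comments complement a primary key on (subject, predicate, object), so that every triple-pattern shape can be answered by an index scan. Illustrative SQL only; the real statements come from psql_utils.get_create_indexes_queries and may differ:

def example_create_indexes_queries(graph_name):
    # Hypothetical stand-in for psql_utils.get_create_indexes_queries:
    # B-tree indexes covering the OSP and POS access paths.
    return [
        f"CREATE INDEX {graph_name}_osp ON {graph_name} (object, subject, predicate)",
        f"CREATE INDEX {graph_name}_pos ON {graph_name} (predicate, object, subject)",
    ]
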
Example #5
def index_sqlite(config, graph_name):
    """Create the additional B-tree indexes on the RDF graph GRAPH_NAME, described in the configuration file CONFIG."""
    # load graph from config file
    graph, backend = load_graph(config,
                                graph_name,
                                logger,
                                backends=['sqlite', 'sqlite-catalog'])

    # init SQLite connection
    logger.info("Connecting to the SQLite database...")
    connection = connect_sqlite(graph)
    if connection is None:
        logger.error('Failed to establish a connection with SQLite')
        exit(1)
    # disable implicit transactions so the explicit BEGIN/COMMIT below take effect
    connection.isolation_level = None
    logger.info("Connected to the SQLite database")

    # create a cursor to interact with the database
    cursor = connection.cursor()

    # start a transaction
    cursor.execute("BEGIN TRANSACTION")

    # create indexes
    start = time.time()
    logger.info("Creating additional B-tree indexes...")
    create_indexes_queries = sqlite_utils.get_create_indexes_queries(
        graph_name, backend)
    for query in create_indexes_queries:
        cursor.execute(query)
    stop = time.time()
    logger.info(
        f"Additional B-tree indexes successfully created in {stop - start}s")

    # rebuild table statistics
    logger.info("Rebuilding table statistics...")
    start = time.time()
    cursor.execute(sqlite_utils.get_analyze_query(graph_name))
    logger.info(
        f"Table statistics successfully rebuilt in {time.time() - start}s")

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    cursor.execute("COMMIT")
    cursor.close()
    connection.close()
    logger.info(
        f"Sage SQlite model for graph '{graph_name}' successfully initialized")
Example #6
def index_postgres(config, graph_name):
    """
        Create the additional B-tree indexes on the RDF graph GRAPH_NAME, described in the configuration file CONFIG. The graph must use the PostgreSQL or PostgreSQL-MVCC backend.
    """
    # install logger
    coloredlogs.install(level='INFO',
                        fmt='%(asctime)s - %(levelname)s %(message)s')
    logger = logging.getLogger(__name__)

    # load graph from config file
    graph, kind = load_graph(config,
                             graph_name,
                             logger,
                             backends=['postgres', 'postgres-mvcc'])
    enable_mvcc = kind == 'postgres-mvcc'

    # init PostgreSQL connection
    connection = connect_postgres(graph)
    if connection is None:
        exit(1)
    # turn off autocommit
    connection.autocommit = False
    # create all SQL queries used to init the graph, using the graph name
    table_name = graph['name']
    create_indexes_queries = p_utils.get_postgres_create_indexes(
        table_name, enable_mvcc=enable_mvcc)

    # create indexes
    cursor = connection.cursor()
    start = time()
    logger.info("Creating additional B-tree indexes...")
    for q in create_indexes_queries:
        cursor.execute(q)
    stop = time()
    logger.info(
        "Additional B-tree indexes successfully created in {}s".format(stop -
                                                                       start))

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    connection.commit()
    cursor.close()
    connection.close()
    logger.info(
        "Sage PostgreSQL model for table {} successfully initialized".format(
            table_name))
Example #7
def stream_postgres(config, graph_name, rdf_file, block_size,
                    commit_threshold):
    """
        Insert RDF triples from file RDF_FILE into the RDF graph GRAPH_NAME, described in the configuration file CONFIG. The graph must use the PostgreSQL or PostgreSQL-MVCC backend.
    """

    # load graph from config file
    graph, kind = load_graph(config,
                             graph_name,
                             logger,
                             backends=['postgres', 'postgres-mvcc'])
    enable_mvcc = kind == 'postgres-mvcc'

    # init PostgreSQL connection
    logger.info("Connecting to PostgreSQL server...")
    connection = connect_postgres(graph)
    logger.info("Connected to PostgreSQL server")
    if connection is None:
        exit(1)
    # turn off autocommit
    connection.autocommit = False

    # compute SQL table name and the bulk load SQL query
    table_name = graph['name']
    insert_into_query = p_utils.get_postgres_insert_into(
        table_name, enable_mvcc=enable_mvcc)

    cursor = connection.cursor()

    logger.info("Reading NT RDF source file...(I hope)")
    #    iterator, nb_triples = get_rdf_reader(rdf_file, format=format)

    start = time()
    bucket = list()
    n = SkipParser(
        StreamSink(bucket, block_size, insert_into_query, cursor, connection,
                   commit_threshold, logger))
    with open(rdf_file, "rb") as anons:
        n.skipparse(anons)
    try:
        # flush the last, partially-filled bucket of triples
        execute_values(cursor, insert_into_query, bucket, page_size=block_size)
    except Exception as error:
        logger.error("Failed to insert the remaining triples: {}".format(error))

    connection.commit()

    end = time()
    logger.info(
        "RDF triples ingestion successfully completed in {}s".format(end -
                                                                     start))

    # run an ANALYZE query to rebuild statistics
    # logger.info("Rebuilding table statistics...")
    # start = time()
    # cursor.execute("ANALYZE {}".format(table_name))
    # end = time()
    # logger.info("Table statistics successfully rebuilt in {}s".format(end - start))

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    connection.commit()
    cursor.close()
    connection.close()
    logger.info(
        "RDF data from file '{}' successfully inserted into RDF graph '{}'".
        format(rdf_file, table_name))
    logger.info(f"Remember to run ANALYZE on Postgres table: {table_name}")
Example #8
def put_postgres(config, graph_name, rdf_file, format, block_size,
                 commit_threshold):
    """
        Insert RDF triples from file RDF_FILE into the RDF graph GRAPH_NAME, described in the configuration file CONFIG. The graph must use the PostgreSQL or PostgreSQL-MVCC backend.
    """
    # install logger
    coloredlogs.install(level='INFO',
                        fmt='%(asctime)s - %(levelname)s %(message)s')
    logger = logging.getLogger(__name__)

    # load graph from config file
    graph, kind = load_graph(config,
                             graph_name,
                             logger,
                             backends=['postgres', 'postgres-mvcc'])
    enable_mvcc = kind == 'postgres-mvcc'

    # init PostgreSQL connection
    logger.info("Connecting to PostgreSQL server...")
    connection = connect_postgres(graph)
    logger.info("Connected to PostgreSQL server")
    if connection is None:
        exit(1)
    # turn off autocommit
    connection.autocommit = False

    # compute SQL table name and the bulk load SQL query
    table_name = graph['name']
    insert_into_query = p_utils.get_postgres_insert_into(
        table_name, enable_mvcc=enable_mvcc)

    logger.info("Reading RDF source file...")
    iterator, nb_triples = get_rdf_reader(rdf_file, format=format)
    logger.info(
        "RDF source file loaded. Found ~{} RDF triples to ingest.".format(
            nb_triples))

    logger.info("Starting RDF triples ingestion...")
    cursor = connection.cursor()

    # insert rdf triples
    start = time()
    to_commit = 0
    # insert by bucket (and show a progress bar)
    with click.progressbar(length=nb_triples,
                           label="Inserting RDF triples") as bar:
        for bucket in bucketify(iterator, block_size):
            to_commit += len(bucket)
            # bulk load the bucket of RDF triples, then update progress bar
            execute_values(cursor,
                           insert_into_query,
                           bucket,
                           page_size=block_size)
            bar.update(len(bucket))
            # commit if above threshold
            if to_commit >= commit_threshold:
                # logger.info("Commit threshold reached. Committing all changes...")
                connection.commit()
                # logger.info("All changes were successfully committed.")
                to_commit = 0
    end = time()
    logger.info(
        "RDF triples ingestion successfully completed in {}s".format(end -
                                                                     start))

    # run an ANALYZE query to rebuild statistics
    logger.info("Rebuilding table statistics...")
    start = time()
    cursor.execute("ANALYZE {}".format(table_name))
    end = time()
    logger.info("Table statistics successfully rebuilt in {}s".format(end -
                                                                      start))

    # commit and cleanup connection
    logger.info("Committing and cleaning up...")
    connection.commit()
    cursor.close()
    connection.close()
    logger.info(
        "RDF data from file '{}' successfully inserted into RDF graph '{}'".
        format(rdf_file, table_name))
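
bucketify is used above but not shown. A minimal sketch of such a helper, grouping any iterator into successive lists of at most block_size items (the project's own implementation may differ):

from itertools import islice

def bucketify(iterator, bucket_size):
    """Hypothetical sketch: yield lists of at most bucket_size items."""
    iterator = iter(iterator)
    while True:
        bucket = list(islice(iterator, bucket_size))
        if not bucket:
            return
        yield bucket
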
Example #9
def put_sqlite(config, graph_name, rdf_file, format, block_size,
               commit_threshold, cache_size):
    """Insert RDF triples from file RDF_FILE into the RDF graph GRAPH_NAME, described in the configuration file CONFIG."""
    # load graph from config file
    graph, backend = load_graph(config,
                                graph_name,
                                logger,
                                backends=['sqlite', 'sqlite-catalog'])

    # init SQLite connection
    logger.info("Connecting to the SQLite database...")
    connection = connect_sqlite(graph)
    if connection is None:
        logger.error('Failed to establish a connection with SQLite')
        exit(1)
    # disable implicit transactions so the explicit BEGIN/COMMIT below take effect
    connection.isolation_level = None
    logger.info("Connected to the SQLite database")

    # create a cursor to interact with the database
    cursor = connection.cursor()

    # start a transaction
    cursor.execute("BEGIN TRANSACTION")

    logger.info("Reading RDF source file...")
    nb_triples = get_nb_triples(rdf_file, format)
    logger.info(f"Found ~{nb_triples} RDF triples to ingest.")

    start = time.time()
    to_commit = 0
    inserted = 0
    dropped = 0

    cache = pylru.lrucache(cache_size)

    with click.progressbar(
            length=nb_triples,
            label=f"Inserting RDF triples 0/{nb_triples} - {dropped} triples dropped."
    ) as bar:

        def on_bucket(bucket):
            nonlocal to_commit, inserted, dropped
            insert_bucket(cursor, bucket, graph_name, backend, block_size,
                          cache)
            to_commit = to_commit + len(bucket)
            if to_commit >= commit_threshold:
                connection.commit()
                to_commit = 0
            inserted = inserted + len(bucket)
            bar.label = f"Inserting RDF triples {inserted}/{nb_triples} - {dropped} triples dropped."
            bar.update(len(bucket))

        def on_error(error):
            nonlocal dropped, inserted
            dropped = dropped + 1
            bar.label = f"Inserting RDF triples {inserted}/{nb_triples} - {dropped} triples dropped."
            bar.update(0)

        def on_complete():
            nonlocal start
            logger.info(
                f"Triples ingestion successfully completed in {time.time() - start}s"
            )
            logger.info("Rebuilding table statistics...")
            start = time.time()
            cursor.execute(sqlite_utils.get_analyze_query(graph_name))
            logger.info(
                f"Table statistics successfully rebuilt in {time.time() - start}s"
            )
            logger.info("Committing and cleaning up...")
            cursor.execute("COMMIT")
            cursor.close()
            connection.close()
            logger.info(
                f"RDF data from file '{rdf_file}' successfully inserted into RDF graph '{graph_name}'"
            )

        logger.info("Starting RDF triples ingestion...")
        parser = ParserFactory.create_parser(format, block_size)
        parser.on_bucket = on_bucket
        parser.on_error = on_error
        parser.on_complete = on_complete
        parser.parsefile(rdf_file)
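
ParserFactory and its parsers are also outside this listing; the example above only relies on a small callback contract: on_bucket receives a list of triples, on_error receives a parse error, and on_complete fires at end of input. A toy line-oriented parser honoring that assumed contract (the interface is inferred from the example, not taken from the project):

class ToyParser:
    """Hypothetical parser implementing the on_bucket/on_error/on_complete contract."""

    def __init__(self, bucket_size):
        self._bucket_size = bucket_size
        self.on_bucket = lambda bucket: None
        self.on_error = lambda error: None
        self.on_complete = lambda: None

    def parsefile(self, path):
        bucket = []
        with open(path, encoding="utf-8") as source:
            for line in source:
                line = line.strip()
                if not line:
                    continue
                parts = line.rstrip(" .").split(None, 2)
                if len(parts) != 3:
                    self.on_error(ValueError(f"malformed triple: {line!r}"))
                    continue
                bucket.append(tuple(parts))
                if len(bucket) == self._bucket_size:
                    self.on_bucket(bucket)
                    bucket = []
        if bucket:
            self.on_bucket(bucket)
        self.on_complete()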