Ejemplo n.º 1
0
            nodes[name] = node
        else:
            nodes[name].update(node)
    #closing the current db
    db.close()
    #logging out some info
    current_file = PIECE_LIST.index(filename)
    sys.stdout.write(
        "Building the node dictionary: Processing %d files out of %d\r" %
        (current_file, sum_files))

#making a memory database and inserting the unique nodes from the nodes dictionary
print('Inserting nodes to database')
parser = PsimiSQL(SQL_SEED)
for node in nodes:
    parser.insert_unique_node(nodes[node])

#now that we have the nodes in the final db, the edges can be inserted
#there is no need for a edges dictionary, because reading it from the files costs less memory
#iterating through the .db piece files again
print('Inserting edges to database')
for filename in PIECE_LIST:
    db = sqlite3.connect(filename)
    query = "SELECT * FROM edge"
    cursor = db.cursor()
    cursor.execute(query)

    #iterating trough the current piece .db files
    while True:
        edge_row = cursor.fetchone()
        if edge_row == None:
def main():
    """Merge the SQLite piece files into one final database.

    Collects the unique human (taxid:9606) and Salmonella (taxid:99284)
    nodes from every file in SOURCE_DB_FILE_LIST, inserts them through a
    PsimiSQL parser, merges duplicate edges whose both endpoints are known
    nodes, inserts the merged edges, and saves the result to DESTINATION.
    """
    # declaring the dicts that will hold the data
    nodes = {}            # node name -> node property dict (unique nodes)
    collected_edges = {}  # "<name_a>@<name_b>" -> merged edge property dict

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # filling up the nodes dictionary with the data contained in db piece files
    # (enumerate avoids the O(n) list.index lookup the old code did per file)
    for current_file, db_file in enumerate(SOURCE_DB_FILE_LIST):

        # executing a query that selects everything (but the node id) from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        # only human and Salmonella nodes are collected
        cursor.execute("SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'")

        # iterating through the db row by row, until the last row
        while True:
            row = cursor.fetchone()
            if row is None:
                break
            # node_id is the piece file's rowid; it is not reused in the merged db
            node_id, name, alt_accession, tax_id, pathways, aliases, topology = row
            node = {
                "name": name,
                "alt_accession": alt_accession,
                "tax_id": tax_id,
                "pathways": pathways,
                "aliases": aliases,
                "topology": topology,
            }
            # if unique, inserting the node (row) to the nodes dictionary,
            # otherwise merging its properties with the already stored node
            if name not in nodes:
                nodes[name] = node
            else:
                nodes[name] = get_union_of_nodes(nodes[name], node)
        # closing the current db
        db.close()
        # logging out some progress info
        sys.stdout.write("Building the node dictionary: Processing %d file out of %d\r" % (current_file, sum_files))

    # making a memory database and inserting the unique nodes from the nodes dictionary
    print("Inserting nodes to database")
    parser = PsimiSQL()
    for name in nodes:
        parser.insert_unique_node(nodes[name])
        # remembering the final db id so that edges can reference the node later
        nodes[name]["id"] = parser.cursor.lastrowid

    # looping through the files again to make an edge list
    # (BUGFIX: file_counter was never incremented before, the progress
    # message always displayed 1/N)
    print("Started building edge dict")
    for file_counter, db_file in enumerate(SOURCE_DB_FILE_LIST, start=1):

        sys.stdout.write("Inserting edges to edge dict from '%s' (%d/%d)\r" % (db_file, file_counter, sum_files))

        # executing a query that selects everything (but the node id) from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute("SELECT * FROM edge")

        while True:
            row = cursor.fetchone()

            # if there aren't any more rows break out of the loop
            if not row:
                break

            # deconstructing the row (list)
            (edge_row_id, old_interactor_a_node_id, old_interactor_b_node_id,
             interactor_a_node_name, interactor_b_node_name,
             interaction_detection_method, first_author, publication_ids,
             interaction_types, source_db, interaction_identifiers,
             confidence_scores, layer) = row

            # because in the nodes dict building process the query only asks
            # for human and Salmonella nodes, only edges whose both endpoints
            # are in the nodes dict may be inserted
            if interactor_a_node_name in nodes and interactor_b_node_name in nodes:

                # generating an edge id that will be the key in the edge dict
                edge_id = interactor_a_node_name + "@" + interactor_b_node_name

                # generating an edge dict, that will be a value for the key in the collected_edges dict
                current_edge = {
                    "interaction_detection_method": interaction_detection_method,
                    "first_author": first_author,
                    "publication_ids": publication_ids,
                    "interaction_types": interaction_types,
                    "source_db": source_db,
                    "interaction_identifiers": interaction_identifiers,
                    "confidence_scores": confidence_scores,
                    "layer": layer,
                }

                # if the collected_edges dict does not contain this edge_id
                # the edge is stored, otherwise it is merged property by
                # property with the already collected edge
                if edge_id not in collected_edges:
                    collected_edges[edge_id] = current_edge
                else:
                    collected_edge = collected_edges[edge_id]
                    for prop in ("interaction_types", "first_author",
                                 "source_db", "interaction_identifiers",
                                 "interaction_detection_method",
                                 "confidence_scores"):
                        collected_edge[prop] = merge_strings(collected_edge[prop], current_edge[prop])
        # closing the current piece db (BUGFIX: the connection was leaked before)
        db.close()

    print("Building edge dict done!")
    print("Started inserting edges to the db")
    # iterating through the edges dictionary and inserting the edges to the SQLite db
    for collected_edge_id, edge_to_insert in collected_edges.items():
        # getting the two endpoint nodes back from the edge id
        node_a, node_b = collected_edge_id.split("@")

        node_a_dict = nodes[node_a]
        node_b_dict = nodes[node_b]

        parser.insert_edge(node_a_dict, node_b_dict, edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)
Ejemplo n.º 3
0
def main():
    """Merge the SQLite piece files into one final database.

    Collects the unique human (taxid:9606) and Salmonella (taxid:99284)
    nodes from every file in SOURCE_DB_FILE_LIST, inserts them through a
    PsimiSQL parser, merges duplicate edges whose both endpoints are known
    nodes, inserts the merged edges, and saves the result to DESTINATION.
    """
    # declaring the dicts that will hold the data
    nodes = {}            # node name -> node property dict (unique nodes)
    collected_edges = {}  # '<name_a>@<name_b>' -> merged edge property dict

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # filling up the nodes dictionary with the data contained in db piece files
    # (enumerate avoids the O(n) list.index lookup the old code did per file)
    for current_file, db_file in enumerate(SOURCE_DB_FILE_LIST):

        # executing a query that selects everything (but the node id) from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        # only human and Salmonella nodes are collected
        cursor.execute(
            "SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'"
        )

        # iterating through the db row by row, until the last row
        while True:
            row = cursor.fetchone()
            if row is None:
                break
            # node_id is the piece file's rowid; it is not reused in the merged db
            node_id, name, alt_accession, tax_id, pathways, aliases, topology = row
            node = {
                'name': name,
                'alt_accession': alt_accession,
                'tax_id': tax_id,
                'pathways': pathways,
                'aliases': aliases,
                'topology': topology
            }
            # if unique, inserting the node (row) to the nodes dictionary,
            # otherwise merging its properties with the already stored node
            if name not in nodes:
                nodes[name] = node
            else:
                nodes[name] = get_union_of_nodes(nodes[name], node)
        # closing the current db
        db.close()
        # logging out some progress info
        sys.stdout.write(
            "Building the node dictionary: Processing %d file out of %d\r" %
            (current_file, sum_files))

    # making a memory database and inserting the unique nodes from the nodes dictionary
    print('Inserting nodes to database')
    parser = PsimiSQL()
    for name in nodes:
        parser.insert_unique_node(nodes[name])
        # remembering the final db id so that edges can reference the node later
        nodes[name]['id'] = parser.cursor.lastrowid

    # looping through the files again to make an edge list
    # (BUGFIX: file_counter was never incremented before, the progress
    # message always displayed 1/N)
    print("Started building edge dict")
    for file_counter, db_file in enumerate(SOURCE_DB_FILE_LIST, start=1):

        sys.stdout.write("Inserting edges to edge dict from '%s' (%d/%d)\r" %
                         (db_file, file_counter, sum_files))

        # executing a query that selects everything (but the node id) from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute("SELECT * FROM edge")

        while True:
            row = cursor.fetchone()

            # if there aren't any more rows break out of the loop
            if not row:
                break

            # deconstructing the row (list)
            (edge_row_id, old_interactor_a_node_id, old_interactor_b_node_id,
             interactor_a_node_name, interactor_b_node_name,
             interaction_detection_method, first_author, publication_ids,
             interaction_types, source_db, interaction_identifiers,
             confidence_scores, layer) = row

            # because in the nodes dict building process the query only asks
            # for human and Salmonella nodes, only edges whose both endpoints
            # are in the nodes dict may be inserted
            if interactor_a_node_name in nodes and interactor_b_node_name in nodes:

                # generating an edge id that will be the key in the edge dict
                edge_id = interactor_a_node_name + "@" + interactor_b_node_name

                # generating an edge dict, that will be a value for the key in the collected_edges dict
                current_edge = {
                    'interaction_detection_method':
                    interaction_detection_method,
                    'first_author': first_author,
                    'publication_ids': publication_ids,
                    'interaction_types': interaction_types,
                    'source_db': source_db,
                    'interaction_identifiers': interaction_identifiers,
                    'confidence_scores': confidence_scores,
                    'layer': layer
                }

                # if the collected_edges dict does not contain this edge_id
                # the edge is stored, otherwise it is merged property by
                # property with the already collected edge
                if edge_id not in collected_edges:
                    collected_edges[edge_id] = current_edge
                else:
                    collected_edge = collected_edges[edge_id]
                    for prop in ('interaction_types', 'first_author',
                                 'source_db', 'interaction_identifiers',
                                 'interaction_detection_method',
                                 'confidence_scores'):
                        collected_edge[prop] = merge_strings(
                            collected_edge[prop], current_edge[prop])
        # closing the current piece db (BUGFIX: the connection was leaked before)
        db.close()

    print("Building edge dict done!")
    print("Started inserting edges to the db")
    # iterating through the edges dictionary and inserting the edges to the SQLite db
    for collected_edge_id, edge_to_insert in collected_edges.items():
        # getting the two endpoint nodes back from the edge id
        node_a, node_b = collected_edge_id.split('@')

        node_a_dict = nodes[node_a]
        node_b_dict = nodes[node_b]

        parser.insert_edge(node_a_dict, node_b_dict, edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)