help="The location of the PsimiSQL class")
parser.add_argument('--map-file', required=True, metavar="MapFile", type=str,
                    help="The location of the file that maps salmonella proteins to uniprot")
parser.add_argument('--dictionary-db', required=True, metavar="Dictionary-db", type=str)
parser.add_argument('--prediction-name', required=True, metavar="PredName", type=str)

args = parser.parse_args()
dictionary_api = sqlite3.connect(args.dictionary_db)
dictionary_cursor = dictionary_api.cursor()


sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# the unique nodes will be held here
salmonella_nodes = {}
human_nodes = {}
edges = []

# parsing the map file
with open(args.map_file) as mapFile:

    # skipping header
    mapFile.readline()

    for line in mapFile:
        uniprot, salmonella = line.strip().split("\t")
Example #2
def main():

    # opening the old_db for mapping
    old_db = PsimiSQL()
    old_db.import_from_db_file(SOURCE_DB_LOCATION)

    # making the script more verbose
    counter = 0
    old_db.cursor.execute("SELECT count(*) FROM node")
    number_of_nodes = old_db.cursor.fetchone()[0]

    # iterating through the old_db's nodes
    old_db.cursor.execute("SELECT * FROM node")

    # mapping old node ids to new node ids
    old_node_ids_dict = {}

    # initiating an empty db where the mapped nodes are put
    new_db = PsimiSQL()

    # declaring a counter to count the nodes that do not match
    no_match_counter = 0
    invalid_node_counter = 0

    # looping through the old_db's nodes
    while True:
        row = old_db.cursor.fetchone()

        # communicating with user
        sys.stdout.write("Querying %d. node from dictionary out of %d\r" %
                         (counter, number_of_nodes))
        counter += 1

        # until the last row
        if row == None:
            break
        else:
            row_id, mitab_name, alt_accession, mitab_tax_id, pathways, aliases, topology = row

            tax_id = str(mitab_tax_id.split(':')[1])
            name = str(mitab_name.split(':')[1])

            old_node_dict = {
                "id": row_id,
                "name": mitab_name,
                "alt_accession": alt_accession,
                "tax_id": mitab_tax_id,
                "pathways": pathways,
                "aliases": aliases,
                "topology": topology
            }

            # if the fetched node is already mapped, just its copy will be inserted
            #  if "uniprot" in mitab_name:
            #      add_uniprot(old_node_dict,old_node_ids_dict,new_db)
            #  else:

            query = """
                SELECT DISTINCT foreign_ids.accession, uniprot.accession, uniprot.is_swissprot, uniprot.is_primary
                FROM foreign_ids JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
                WHERE foreign_ids.accession = ? AND uniprot.tax_id = ? AND uniprot.is_primary = 1
            """

            tup = (name, tax_id)

            DICTIONARY_DB_CURSOR.execute(query, tup)
            DICTIONARY_DB.commit()

            result = DICTIONARY_DB_CURSOR.fetchall()

            if len(result) == 0:
                # if there is no match in the map for the current node
                no_match_counter += 1
            else:
                # get a list with only the swissprot nodes from the result of the SQL query
                swiss_nodes = get_swiss_arr(result)

                # getting the trembl nodes arr
                trembl_nodes = get_trembl_arr(result)

                # getting the new aliases
                aliases = get_aliases_string(trembl_nodes)

                # best case scenario it's a 1 -> 1 map
                if len(swiss_nodes) == 1:
                    swiss_accession = "uniprot:" + swiss_nodes[0][1]
                    add_node(old_node_dict, old_node_ids_dict, swiss_accession,
                             new_db, aliases)
                # if it maps to more than one swissprot accession, all swissprot nodes will be added
                elif len(swiss_nodes) > 1:
                    for node in swiss_nodes:
                        swiss_accession = "uniprot:" + node[1]
                        add_node(old_node_dict, old_node_ids_dict,
                                 swiss_accession, new_db, aliases)
                # adding trembl nodes if the old node does not match any swissprot accession
                else:
                    for node in trembl_nodes:
                        trembl_accession = "trembl:" + node[1]
                        add_node(old_node_dict, old_node_ids_dict,
                                 trembl_accession, new_db, aliases)

    print("Inserting to %s nodes done" % SOURCE_DB_TYPE)

    # setting up counters to give the user some information about the ongoing process
    old_db.cursor.execute("SELECT count(*) FROM edge")
    number_of_edges = old_db.cursor.fetchone()[0]
    edge_counter = 0

    query = "SELECT * from edge"
    old_db.cursor.execute(query)

    while True:
        # informing the user
        sys.stdout.write("Parsing edge # %d out of %d\r" %
                         (edge_counter, number_of_edges))
        row = old_db.cursor.fetchone()

        if row == None:
            break
        else:
            edge_counter += 1

            # deconstructing the row (list)
            edge_id, old_interactor_a_node_id, old_interactor_b_node_id, interactor_a_node_name, interactor_b_node_name, interaction_detection_method, first_author, publication_ids, interaction_types, source_db, interaction_identifiers, confidence_scores, layer = row

            # since we get the old interactor ids from this query we can simply look up their new id(s) in the old_node_ids dict
            # if both nodes mapped, we add them as an edge to the new db
            if old_node_ids_dict.has_key(
                    old_interactor_a_node_id) and old_node_ids_dict.has_key(
                        old_interactor_b_node_id):

                # looping through every new 'A' node
                for new_node_id_a in old_node_ids_dict[
                        old_interactor_a_node_id]:

                    new_node_a_dict = new_db.get_node_by_id(new_node_id_a)

                    # looping through every new 'B' node for every new 'A' node and inserting them as an edge
                    for new_node_id_b in old_node_ids_dict[
                            old_interactor_b_node_id]:

                        new_node_b_dict = new_db.get_node_by_id(new_node_id_b)

                        # generating the new edge dict
                        new_edge_dict = {
                            'interactor_a_node_id': new_node_id_a,
                            'interactor_b_node_id': new_node_id_b,
                            'interactor_a_node_name': interactor_a_node_name,
                            'interactor_b_node_name': interactor_b_node_name,
                            'interaction_detection_method':
                            interaction_detection_method,
                            'first_author': first_author,
                            'publication_ids': publication_ids,
                            'source_db': "source database:" + SOURCE_DB_TYPE,
                            'interaction_types': interaction_types,
                            'interaction_identifiers': interaction_identifiers,
                            'confidence_scores': confidence_scores,
                            'layer': layer
                        }

                        # inserting the new edge
                        new_db.insert_edge(new_node_a_dict, new_node_b_dict,
                                           new_edge_dict)
            else:
                # counting the edges that can't be inserted into the new db because one of their nodes wasn't mapped
                invalid_node_counter += 1
    print("Inserting edges to %s.db finished!" % SOURCE_DB_TYPE)

    new_db.save_db_to_file(DESTINATION_DB_LOCATION)
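
# NOTE: get_swiss_arr, get_trembl_arr and get_aliases_string are project helpers that
# are not shown on this page. The functions below are only a hypothetical sketch of
# them, assuming every result row is the (foreign accession, uniprot accession,
# is_swissprot, is_primary) tuple returned by the dictionary query above, and that
# aliases are joined with the MITAB-style pipe separator; the real code may differ.

def get_swiss_arr(result):
    # keep only the rows whose is_swissprot flag (third column) is set
    return [row for row in result if row[2]]

def get_trembl_arr(result):
    # keep only the TrEMBL rows, i.e. the ones not flagged as SwissProt
    return [row for row in result if not row[2]]

def get_aliases_string(trembl_nodes):
    # assumed format: pipe-separated "trembl:<accession>" entries, '-' when there are none
    if not trembl_nodes:
        return "-"
    return "|".join("trembl:" + row[1] for row in trembl_nodes)
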
def main():
    # declaring the dicts that will hold the data
    nodes = {}
    collected_edges = {}

    merged_edge_counter = 0
    not_merged_edge = 0

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # filling up the nodes dictionary with the data contained in db piece files
    for db_file in SOURCE_DB_FILE_LIST:

        # executing a query that selects every node row from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute("SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'")

        # iterating through the db row by row
        while True:
            row = cursor.fetchone()
            # until the last row
            if row == None:
                break
            # if unique, inserting the node (row) to the nodes dictionary
            id, name, alt_accession, tax_id, pathways, aliases, topology = row
            node = {
                "name": name,
                "alt_accession": alt_accession,
                "tax_id": tax_id,
                "pathways": pathways,
                "aliases": aliases,
                "topology": topology,
            }
            if not nodes.has_key(name):
                nodes[name] = node
            else:
                nodes[name] = get_union_of_nodes(nodes[name], node)
        # closing the current db
        db.close()
        # logging out some info
        current_file = SOURCE_DB_FILE_LIST.index(db_file) + 1
        sys.stdout.write("Building the node dictionary: Processing file %d of %d\r" % (current_file, sum_files))

    # making a memory database and inserting the unique nodes from the nodes dictionary
    print("Inserting nodes to database")
    parser = PsimiSQL()
    for node in nodes:
        parser.insert_unique_node(nodes[node])
        nodes[node]["id"] = parser.cursor.lastrowid

    # looping through the files again to make an edge list
    print("Started building edge dict")
    file_counter = 1
    for db_file in SOURCE_DB_FILE_LIST:

        sys.stdout.write("Inserting edges to edge dict from '%s' (%d/%d)\r" % (db_file, file_counter, sum_files))

        # executing a query that selects every edge row from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute("SELECT * FROM edge")

        while True:
            row = cursor.fetchone()

            # if there aren't any more rows, break out of the loop
            if not row:
                break
            else:
                # deconstructing the row (list)
                edge_row_id, old_interactor_a_node_id, old_interactor_b_node_id, interactor_a_node_name, interactor_b_node_name, interaction_detection_method, first_author, publication_ids, interaction_types, source_db, interaction_identifiers, confidence_scores, layer = (
                    row
                )

                # because the node dict building query only asked for human and salmonella nodes,
                # we have to make sure that we only insert edges whose
                # nodes are both in the nodes dict (i.e. neither node belongs to another organism)
                if nodes.has_key(interactor_a_node_name) and nodes.has_key(interactor_b_node_name):

                    # generating an edge id that will be the key in the edge dict
                    edge_id = interactor_a_node_name + "@" + interactor_b_node_name

                    # generating an edge dict, that will be a value for the key in the collected_edges dict
                    current_edge = {
                        "interaction_detection_method": interaction_detection_method,
                        "first_author": first_author,
                        "publication_ids": publication_ids,
                        "interaction_types": interaction_types,
                        "source_db": source_db,
                        "interaction_identifiers": interaction_identifiers,
                        "confidence_scores": confidence_scores,
                        "layer": layer,
                    }

                    # if the collected_edges dict does not contain
                    # this edge_id, the edge is stored in collected_edges
                    if not collected_edges.has_key(edge_id):
                        collected_edges[edge_id] = current_edge
                    else:
                        # if collected_edges has this id the edge will be merged
                        collected_edge = collected_edges[edge_id]

                        # if an edge is already in the dict it will be merged with the current edge
                        collected_edge["interaction_types"] = merge_strings(
                            collected_edge["interaction_types"], current_edge["interaction_types"]
                        )
                        collected_edge["first_author"] = merge_strings(
                            collected_edge["first_author"], current_edge["first_author"]
                        )
                        collected_edge["source_db"] = merge_strings(
                            collected_edge["source_db"], current_edge["source_db"]
                        )
                        collected_edge["interaction_identifiers"] = merge_strings(
                            collected_edge["interaction_identifiers"], current_edge["interaction_identifiers"]
                        )
                        collected_edge["interaction_detection_method"] = merge_strings(
                            collected_edge["interaction_detection_method"], current_edge["interaction_detection_method"]
                        )
                        collected_edge["confidence_scores"] = merge_strings(
                            collected_edge["confidence_scores"], current_edge["confidence_scores"]
                        )

    print("Building edge dict done!")
    print("Started inserting edges to the db")
    # iterating through the edges dictionary and inserting edges to the SQLite db
    for collected_edge_id, edge_to_insert in collected_edges.iteritems():
        # getting the nodes
        node_a, node_b = collected_edge_id.split("@")

        node_a_dict = nodes[node_a]
        node_b_dict = nodes[node_b]

        parser.insert_edge(node_a_dict, node_b_dict, edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)
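
# NOTE: merge_strings and get_union_of_nodes are also external helpers. This is a
# possible sketch only, assuming both work on pipe-separated MITAB-style value lists
# and that merging two nodes with the same accession means taking the union of their
# pathways, aliases and topology fields; the real implementations may differ.

def merge_strings(string_a, string_b):
    # union of two pipe-separated value lists, keeping the original order
    values = []
    for value in (string_a or "").split("|") + (string_b or "").split("|"):
        if value and value != "-" and value not in values:
            values.append(value)
    return "|".join(values) if values else "-"

def get_union_of_nodes(node_a, node_b):
    # same accession seen in two piece files: merge the list-like fields, keep the rest from node_a
    merged = dict(node_a)
    for key in ("pathways", "aliases", "topology"):
        merged[key] = merge_strings(node_a[key], node_b[key])
    return merged
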
def main():
    # parsing the arguments
    arguments = parse_arguments()

    # importing the PsimiSQL class
    sys.path.append(arguments.sqlite_db_api)
    from sqlite_db_api import PsimiSQL

    # the nodes and the edges will be stored in a dict
    nodes = {}
    edges = {}

    # Parsing the nodes first, merging pathways and storing the result in the nodes dict created above
    # (querying each node to check whether it has the same pathway and then updating it with SQL queries would be slow)
    with open(arguments.source_file) as source_file:
        # setting up variables for informing the user
        num_lines = float(sum([1 for line in source_file]))
        line_counter = float(0)
        source_file.seek(0)

        # skipping the header line if needed (after the seek, so the header is not parsed below)
        if arguments.skip_header:
            source_file.readline()

        # looping through the file
        for line in source_file:
            # informing the user
            line_counter += 1
            if line_counter % 50 == 0:
                done = (line_counter / num_lines) * 100
                sys.stdout.write("Parsing mitab file (%d%%)\r" % (done))

            # deconstructing the line
            source_acc, target_acc, source_alt_acc, target_alt_acc, source_alias, target_alias, int_det_method, author, pubmed_ids, source_tax_id, target_tax_id, int_type, source_db, confidence, pathway_ids, layer, source_topology, target_topology = line.strip().split("\t")

            source_dict = {
                "name": source_acc,
                "alt_accession": source_alt_acc,
                "tax_id": source_tax_id,
                "pathways": pathway_ids,
                "aliases": source_alias,
                "topology": source_topology
            }

            add_to_nodes(source_dict, nodes)

            target_dict = {
                "name": target_acc,
                "alt_accession": target_alt_acc,
                "tax_id": target_tax_id,
                "pathways": pathway_ids,
                "aliases": target_alias,
                "topology": target_topology
            }

            add_to_nodes(target_dict, nodes)

            # adding the edge to the edges dict
            edges["%s@%s" % (source_acc, target_acc)] = {
                'interaction_detection_method': int_det_method,
                'first_author': author,
                'publication_ids': pubmed_ids,
                'interaction_types': int_type,
                'source_db': source_db,
                'interaction_identifiers': '-',
                'confidence_scores': confidence,
                'layer': layer
            }

    # informing the user
    print("Parsing MiTAB file: Finished")

    # now that we have the unique nodes we can add them to the Psi-Mi-SQL database

    # initiating the memory Mitab database
    db_api = PsimiSQL()

    num_nodes = float(len(nodes))
    line_counter = float(1)

    # inserting the nodes to the memory db
    for node_name, node_dict in nodes.items():

        #informing the user
        if line_counter % 50 == 0:
            done = float((line_counter / num_nodes) * 100)
            sys.stdout.write("Inserting nodes to NetMiTabSQL (%s%%)" % (done))
        line_counter += 1

        # inserting node to the db file
        db_api.insert_node(node_dict)

        # updating (mutating) the node dict with the SQL row id so it can be used later
        # (again, it is faster to store the row ids for the rows than querying each rowid)
        node_dict["id"] = db_api.last_row_id

    print("Inserting nodes to NetMiTabSQL: Done")

    num_edges = float(len(edges))
    line_counter = float(1)

    # inserting the edges to the memory db
    for edge_id, edge_dict in edges.items():

        #informing the user
        if line_counter % 50 == 0:
            done = float((line_counter / num_edges) * 100)
            sys.stdout.write("Inserting nodes to NetMiTabSQL (%s%%)" % (done))
        line_counter += 1

        source_name, target_name = edge_id.split('@')

        source_dict = nodes[source_name]
        target_dict = nodes[target_name]

        db_api.insert_edge(source_dict, target_dict, edge_dict)

    print("Inserting edges to NetMiTabSQL: Done")

    print("Saving the database to filesystem")
    # the database is finished, saving
    db_api.save_db_to_file(arguments.output_file)

    print("Database saved")
Example #5
        }
        if not nodes.has_key(name):
            nodes[name] = node
        else:
            nodes[name].update(node)
    #closing the current db
    db.close()
    #logging out some info
    current_file = PIECE_LIST.index(filename) + 1
    sys.stdout.write(
        "Building the node dictionary: Processing file %d of %d\r" %
        (current_file, sum_files))

#making a memory database and inserting the unique nodes from the nodes dictionary
print('Inserting nodes to database')
parser = PsimiSQL(SQL_SEED)
for node in nodes:
    parser.insert_unique_node(nodes[node])

#now that we have the nodes in the final db, the edges can be inserted
#there is no need for an edges dictionary, because reading the edges from the files costs less memory
#iterating through the .db piece files again
print('Inserting edges to database')
for filename in PIECE_LIST:
    db = sqlite3.connect(filename)
    query = "SELECT * FROM edge"
    cursor = db.cursor()
    cursor.execute(query)

    #iterating through the current piece .db file
    while True:
    help="The name and optionally location where the data should be saved.",
)
parser.add_argument(
    "--psimisql", required=True, metavar="PsimiSQLLocation", type=str, help="The location of the PsimiSQL class"
)

args = parser.parse_args()

# imports
import sys

sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# the unique nodes will be held here
nodes = {}
edges = {}


# parsing the file
with open(args.source_file) as source_file:

    # read the first line

    source_file.readline()

    for line in source_file:
Example #7
parser.add_argument('--psimisql',
                    required=True,
                    metavar="PsimiSQLLocation",
                    type=str,
                    help="The location of the PsimiSQL class")

args = parser.parse_args()

# imports
import sys

sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# the unique nodes will be held here
nodes = {}

# parsing the file

with open(args.source_file) as source_file:

    # skipping the header line

    source_file.readline()

    for line in source_file:

        linearr = line.strip().split("\t")
Example #8
def main():
    # declaring the dicts that will hold the data
    nodes = {}
    collected_edges = {}

    merged_edge_counter = 0
    not_merged_edge = 0

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # filling up the nodes dictionary with the data contained in db piece files
    for db_file in SOURCE_DB_FILE_LIST:

        # executing a query that selects every node row from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute(
            "SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'"
        )

        # iterating through the db row by row
        while True:
            row = cursor.fetchone()
            # until the last row
            if row == None:
                break
            # if unique, inserting the node (row) to the nodes dictionary
            id, name, alt_accession, tax_id, pathways, aliases, topology = row
            node = {
                "name": name,
                'alt_accession': alt_accession,
                'tax_id': tax_id,
                'pathways': pathways,
                'aliases': aliases,
                'topology': topology
            }
            if not nodes.has_key(name):
                nodes[name] = node
            else:
                nodes[name] = get_union_of_nodes(nodes[name], node)
        # closing the current db
        db.close()
        # logging out some info
        current_file = SOURCE_DB_FILE_LIST.index(db_file) + 1
        sys.stdout.write(
            "Building the node dictionary: Processing file %d of %d\r" %
            (current_file, sum_files))

    # making a memory database and inserting the unique nodes from the nodes dictionary
    print('Inserting nodes to database')
    parser = PsimiSQL()
    for node in nodes:
        parser.insert_unique_node(nodes[node])
        nodes[node]['id'] = parser.cursor.lastrowid

    # looping through the files again to make an edge list
    print("Started building edge dict")
    file_counter = 1
    for db_file in SOURCE_DB_FILE_LIST:

        sys.stdout.write("Inserting edges to edge dict from '%s' (%d/%d)\r" %
                         (db_file, file_counter, sum_files))

        # executing a query that selects every edge row from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute("SELECT * FROM edge")

        while True:
            row = cursor.fetchone()

            # if there aren't any more rows, break out of the loop
            if not row:
                break
            else:
                # deconstructing the row (list)
                edge_row_id, old_interactor_a_node_id, old_interactor_b_node_id, interactor_a_node_name, interactor_b_node_name, interaction_detection_method, first_author, publication_ids, interaction_types, source_db, interaction_identifiers, confidence_scores, layer = row

                # because the node dict building query only asked for human and salmonella nodes,
                # we have to make sure that we only insert edges whose
                # nodes are both in the nodes dict (i.e. neither node belongs to another organism)
                if nodes.has_key(interactor_a_node_name) and nodes.has_key(
                        interactor_b_node_name):

                    # generating an edge id that will be the key in the edge dict
                    edge_id = interactor_a_node_name + "@" + interactor_b_node_name

                    # generating an edge dict, that will be a value for the key in the collected_edges dict
                    current_edge = {
                        'interaction_detection_method':
                        interaction_detection_method,
                        'first_author': first_author,
                        'publication_ids': publication_ids,
                        'interaction_types': interaction_types,
                        'source_db': source_db,
                        'interaction_identifiers': interaction_identifiers,
                        'confidence_scores': confidence_scores,
                        'layer': layer
                    }

                    # if the collected_edges dict does not contain
                    # this edge_id, the edge is stored in collected_edges
                    if not collected_edges.has_key(edge_id):
                        collected_edges[edge_id] = current_edge
                    else:
                        # if collected_edges has this id the edge will be merged
                        collected_edge = collected_edges[edge_id]

                        # if an edge is already in the dict it will be merged with the current edge
                        collected_edge['interaction_types'] = merge_strings(
                            collected_edge['interaction_types'],
                            current_edge['interaction_types'])
                        collected_edge['first_author'] = merge_strings(
                            collected_edge['first_author'],
                            current_edge['first_author'])
                        collected_edge['source_db'] = merge_strings(
                            collected_edge['source_db'],
                            current_edge['source_db'])
                        collected_edge[
                            'interaction_identifiers'] = merge_strings(
                                collected_edge['interaction_identifiers'],
                                current_edge['interaction_identifiers'])
                        collected_edge[
                            'interaction_detection_method'] = merge_strings(
                                collected_edge['interaction_detection_method'],
                                current_edge['interaction_detection_method'])
                        collected_edge['confidence_scores'] = merge_strings(
                            collected_edge['confidence_scores'],
                            current_edge['confidence_scores'])

    print("Building edge dict done!")
    print("Started inserting edges to the db")
    # iterating through the edges dictionary and inserting edges to the SQLite db
    for collected_edge_id, edge_to_insert in collected_edges.iteritems():
        # getting the nodes
        node_a, node_b = collected_edge_id.split('@')

        node_a_dict = nodes[node_a]
        node_b_dict = nodes[node_b]

        parser.insert_edge(node_a_dict, node_b_dict, edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)
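
# NOTE: every example on this page imports PsimiSQL from sqlite_db_api, which is not
# reproduced here. The class below is NOT the real implementation, only an in-memory
# illustration of the subset of the interface these scripts rely on (db, cursor,
# last_row_id, insert_node, insert_unique_node, insert_edge, save_db_to_file);
# import_from_db_file, get_node_by_id and the optional SQL seed argument are omitted.

import sqlite3

class PsimiSQLSketch(object):

    def __init__(self):
        # the working database lives in memory and is dumped to disk at the end
        self.db = sqlite3.connect(":memory:")
        self.cursor = self.db.cursor()
        self.cursor.execute("CREATE TABLE node (id INTEGER PRIMARY KEY, name, alt_accession, "
                            "tax_id, pathways, aliases, topology)")
        self.cursor.execute("CREATE TABLE edge (id INTEGER PRIMARY KEY, interactor_a_node_id, "
                            "interactor_b_node_id, interactor_a_node_name, interactor_b_node_name, "
                            "interaction_detection_method, first_author, publication_ids, "
                            "interaction_types, source_db, interaction_identifiers, "
                            "confidence_scores, layer)")
        self.last_row_id = None

    def insert_node(self, node):
        # insert one node dict and remember its row id
        self.cursor.execute("INSERT INTO node (name, alt_accession, tax_id, pathways, aliases, topology) "
                            "VALUES (?, ?, ?, ?, ?, ?)",
                            (node["name"], node["alt_accession"], node["tax_id"],
                             node["pathways"], node["aliases"], node["topology"]))
        self.last_row_id = self.cursor.lastrowid
        node["id"] = self.last_row_id
        return self.last_row_id

    def insert_unique_node(self, node):
        # insert only if no node with this accession exists yet
        self.cursor.execute("SELECT id FROM node WHERE name = ?", (node["name"],))
        if self.cursor.fetchone() is None:
            self.insert_node(node)

    def insert_edge(self, node_a, node_b, edge):
        # insert one edge dict between two already inserted nodes
        self.cursor.execute("INSERT INTO edge (interactor_a_node_id, interactor_b_node_id, "
                            "interactor_a_node_name, interactor_b_node_name, interaction_detection_method, "
                            "first_author, publication_ids, interaction_types, source_db, "
                            "interaction_identifiers, confidence_scores, layer) "
                            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                            (node_a["id"], node_b["id"], node_a["name"], node_b["name"],
                             edge["interaction_detection_method"], edge["first_author"],
                             edge["publication_ids"], edge["interaction_types"], edge["source_db"],
                             edge["interaction_identifiers"], edge["confidence_scores"], edge["layer"]))

    def save_db_to_file(self, path):
        # dump the in-memory database to a .db file on disk
        disk_db = sqlite3.connect(path)
        disk_db.executescript("\n".join(self.db.iterdump()))
        disk_db.commit()
        disk_db.close()
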
def main():

    # opening the old_db for mapping
    old_db = PsimiSQL()
    old_db.import_from_db_file(SOURCE_DB_LOCATION)

    # making the script more verbose
    counter = 0
    old_db.cursor.execute("SELECT count(*) FROM node")
    number_of_nodes = old_db.cursor.fetchone()[0]

    # iterating through the old_db's nodes
    old_db.cursor.execute("SELECT * FROM node")

    # mapping old node ids to new node ids
    old_node_ids_dict = {}

    # initiating an empty db where the mapped nodes are put
    new_db = PsimiSQL()

    # declaring a counter to count the nodes that do not match
    no_match_counter = 0
    invalid_node_counter = 0

    # looping through the old_db's nodes
    while True:
        row = old_db.cursor.fetchone()

        # communicating with user
        sys.stdout.write("Querying %d. node from dictionary out of %d\r" % (counter, number_of_nodes))
        counter += 1

        # until the last row
        if row == None:
            break
        else:
            row_id, mitab_name, alt_accession, mitab_tax_id, pathways, aliases, topology = row

            tax_id = str(mitab_tax_id.split(':')[1])
            name = str(mitab_name.split(':')[1])

            old_node_dict = {
                "id" : row_id,
                "name" : mitab_name,
                "alt_accession" : alt_accession,
                "tax_id" : mitab_tax_id,
                "pathways" : pathways,
                "aliases" : aliases,
                "topology" : topology
            }

            # if the fetched node is already mapped, just its copy will be inserted
            #  if "uniprot" in mitab_name:
            #      add_uniprot(old_node_dict,old_node_ids_dict,new_db)
            #  else:

            query = """
                SELECT DISTINCT foreign_ids.accession, uniprot.accession, uniprot.is_swissprot, uniprot.is_primary
                FROM foreign_ids JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
                WHERE foreign_ids.accession = ? AND uniprot.tax_id = ? AND uniprot.is_primary = 1
            """

            tup = (name, tax_id)

            DICTIONARY_DB_CURSOR.execute(query, tup)
            DICTIONARY_DB.commit()

            result = DICTIONARY_DB_CURSOR.fetchall()

            if len(result) == 0:
                # if there is no match in the map for the current node
                no_match_counter+=1
            else:
                # get a list with only the swissprot nodes from the result of the SQL query
                swiss_nodes = get_swiss_arr(result)

                # getting the trembl nodes arr
                trembl_nodes = get_trembl_arr(result)

                # getting the new aliases
                aliases = get_aliases_string(trembl_nodes)

                # best case scenario it's a 1 -> 1 map
                if len(swiss_nodes) == 1:
                    swiss_accession  = "uniprot:"+swiss_nodes[0][1]
                    add_node(old_node_dict, old_node_ids_dict, swiss_accession, new_db, aliases)
                # if it maps to more than one swissprot accession, all swissprot nodes will be added
                elif len(swiss_nodes)  > 1:
                    for node in swiss_nodes:
                        swiss_accession = "uniprot:"+node[1]
                        add_node(old_node_dict, old_node_ids_dict, swiss_accession, new_db, aliases)
                # adding trembl nodes if the old node does not match any swissprot accession
                else:
                    for node in trembl_nodes:
                        trembl_accession = "trembl:"+node[1]
                        add_node(old_node_dict, old_node_ids_dict, trembl_accession, new_db, aliases)

    print("Inserting to %s nodes done" % SOURCE_DB_TYPE)

    # setting up counters to give the user some information about the ongoing process
    old_db.cursor.execute("SELECT count(*) FROM edge")
    number_of_edges = old_db.cursor.fetchone()[0]
    edge_counter = 0


    query = "SELECT * from edge"
    old_db.cursor.execute(query)

    while True:
        # informing the user
        sys.stdout.write("Parsing edge # %d out of %d\r" % (edge_counter, number_of_edges))
        row = old_db.cursor.fetchone()

        if row == None:
            break
        else:
            edge_counter += 1

            # deconstructing the row (list)
            edge_id, old_interactor_a_node_id, old_interactor_b_node_id, interactor_a_node_name, interactor_b_node_name, interaction_detection_method , first_author, publication_ids, interaction_types, source_db, interaction_identifiers, confidence_scores, layer = row

            # since we get the old interactor ids from this query we can simply look up their new id(s) in the old_node_ids dict
            # if both nodes mapped, we add them as an edge to the new db
            if old_node_ids_dict.has_key(old_interactor_a_node_id) and old_node_ids_dict.has_key( old_interactor_b_node_id):

                # looping through every new 'A' node
                for new_node_id_a in old_node_ids_dict[old_interactor_a_node_id]:

                    new_node_a_dict = new_db.get_node_by_id(new_node_id_a)

                    # looping through every new 'B' node for every new 'A' node and inserting them as an edge
                    for new_node_id_b in old_node_ids_dict[old_interactor_b_node_id]:

                        new_node_b_dict = new_db.get_node_by_id(new_node_id_b)

                        # generating the new edge dict
                        new_edge_dict = {
                            'interactor_a_node_id' : new_node_id_a,
                            'interactor_b_node_id': new_node_id_b,
                            'interactor_a_node_name' : interactor_a_node_name,
                            'interactor_b_node_name': interactor_b_node_name,
                            'interaction_detection_method' : interaction_detection_method,
                            'first_author' : first_author,
                            'publication_ids' : publication_ids,
                            'source_db' : "source database:"+SOURCE_DB_TYPE,
                            'interaction_types' : interaction_types,
                            'interaction_identifiers' : interaction_identifiers,
                            'confidence_scores' : confidence_scores,
                            'layer' : layer
                        }

                        # inserting the new edge
                        new_db.insert_edge(new_node_a_dict, new_node_b_dict, new_edge_dict)
            else:
                # counting the edges that can't be inserted into the new db because one of their nodes wasn't mapped
                invalid_node_counter += 1
    print("Inserting edges to %s.db finished!" % SOURCE_DB_TYPE)

    new_db.save_db_to_file(DESTINATION_DB_LOCATION)
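
# NOTE: add_node is the central helper of this mapping script and is defined elsewhere.
# The sketch below is a hypothetical reconstruction based only on how it is called
# above and on how old_node_ids_dict is consumed later (one old node id maps to a
# list of new node ids); the real helper may differ.

def add_node(old_node_dict, old_node_ids_dict, new_accession, new_db, aliases):
    # build the mapped node: same annotations, but the mapped accession as the name
    new_node = {
        "name": new_accession,
        "alt_accession": old_node_dict["alt_accession"],
        "tax_id": old_node_dict["tax_id"],
        "pathways": old_node_dict["pathways"],
        "aliases": aliases,
        "topology": old_node_dict["topology"]
    }

    # insert the node, or re-use it if the accession is already in the new db
    new_db.insert_unique_node(new_node)
    new_db.cursor.execute("SELECT id FROM node WHERE name = ?", (new_accession,))
    new_node["id"] = new_db.cursor.fetchone()[0]

    # remember that this old node id maps to this new node id
    # (one old node can map to several new nodes, hence the list)
    old_node_ids_dict.setdefault(old_node_dict["id"], []).append(new_node["id"])
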
                    help="The location of the source file for nodes")
parser.add_argument('--outfile', required=True, metavar="OutputFile", type=str,
                    help="The name and optionally location where the data should be saved.")
parser.add_argument('--psimisql', required=True, metavar="PsimiSQLLocation", type=str,
                    help="The location of the PsimiSQL class")

args = parser.parse_args()

# imports
import sys

sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# the unique nodes will be held here
nodes = {}
edges = {}


# parsing the file
with open(args.source_file) as source_file:

    # read the first line

    source_file.readline()

    for line in source_file:
                    metavar="Dictionary-db",
                    type=str)
parser.add_argument('--prediction-name',
                    required=True,
                    metavar="PredName",
                    type=str)

args = parser.parse_args()
dictionary_api = sqlite3.connect(args.dictionary_db)
dictionary_cursor = dictionary_api.cursor()

sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# the unique nodes will be held here
salmonella_nodes = {}
human_nodes = {}
edges = []

# parsing the map file
with open(args.map_file) as mapFile:

    # skipping header
    mapFile.readline()

    for line in mapFile:
        uniprot, salmonella = line.strip().split("\t")
Example #12
                    help="The name and optionally location where the data should be saved.")
parser.add_argument('--db-name', required=True, metavar="DatabaseName", type=str,
                    help="The name of the source database")
parser.add_argument('--psimisql', required=True, metavar="PsimiSQLLocation", type=str,
                    help="The location of the PsimiSQL class")

args = parser.parse_args()

# imports
import sys

sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# parsing nodes

# nodes will be stored temporarily in memory, because re-querying them is slower
nodes = {}

# looping through the node file
with open(args.node_source_file) as node_file:

    # informing the user
    print "Parsing nodes"
    sum_nodes = sum([1 for x in node_file])
    progress = 1
    node_file.seek(0)
                    help="The location of the csv that was saved from arn")
parser.add_argument('--outfile', required=True, metavar="OutputFile", type=str,
                    help="The name and optionally location where the data should be saved.")
parser.add_argument('--psimisql', required=True, metavar="PsimiSQLLocation", type=str,
                    help="The location of the PsimiSQL class")

args = parser.parse_args()

# imports
import sys

sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# extracting nodes
nodes = {}
edges = {}

print("Parsing file")

with open(args.source_file) as arn_file:

    # skipping the first line
    arn_file.readline()

    for line in arn_file:

        # splitting up the line
    help="The name and optionally location where the data should be saved.",
)
parser.add_argument(
    "--psimisql", required=True, metavar="PsimiSQLLocation", type=str, help="The location of the PsimiSQL class"
)

args = parser.parse_args()

# imports
import sys

sys.path.append(args.psimisql)
from sqlite_db_api import PsimiSQL

# initiating the memory db
db_api = PsimiSQL()

# the unique nodes will be held here
nodes = {}

# parsing the file

with open(args.source_file) as source_file:

    # skipping the header line

    source_file.readline()

    for line in source_file:

        linearr = line.strip().split("\t")
def main():

    for db in args.source_files:

        # informing the user
        print("Parsing %s" % db)

        cursor = sqlite3.connect(db).cursor()

        mapped_nodes = {}
        nodemap = {}

        cursor.execute("SELECT * FROM node")
        result = cursor.fetchall()

        length = len(result)
        current = 1

        new_db = PsimiSQL()

        cursor.execute("SELECT count(*) FROM node")
        num_of_nodes = cursor.fetchone()[0]

        # mapping nodes

        print("Mapping nodes")

        for line in result:

            # informing user
            if (current % 50 == 0):
                print("Mapping nodes %d/%d" % (current, length))

            current += 1

            row_id, name, alt_accession, tax_id, pathways, aliases, topology = line

            old_uniprot = name

            new_uniprot = "uniprot:"+get_primary(old_uniprot.split(':')[1])

            # storing the new uniprot id for every old id
            nodemap[old_uniprot] = new_uniprot

            mapped_node = {
                'name': new_uniprot,
                'alt_accession': alt_accession,
                'tax_id': tax_id,
                'pathways': pathways,
                'aliases': aliases,
                'topology': topology
            }

            mapped_node['id'] = new_db.insert_node(mapped_node)

            mapped_nodes[new_uniprot] = mapped_node

        if len(nodemap) != num_of_nodes:
            print "Gebasz"

        # mapping edges

        cursor.execute("SELECT * FROM edge")
        result = cursor.fetchall()

        print("Mapping edges")
        length = len(result)
        current = 1
        shit_counter = 0

        for row in result:

            if (current % 10 == 0):
               print("Parsing edge %d/%d" % (current, length))
            current += 1

            old_source_uniprot = row[3]
            old_target_uniprot = row[4]


            edge_dict = {
                'interaction_detection_method': row[5],
                'first_author': row[6],
                'publication_ids': row[7],
                'interaction_types': row[8],
                'source_db': row[9],
                'interaction_identifiers': row[10],
                'confidence_scores': row[11],
                'layer': "0"
            }

            if (old_source_uniprot not in mapped_nodes or old_target_uniprot not in mapped_nodes):
                shit_counter +=1
            else:
                new_db.insert_edge(mapped_nodes[old_source_uniprot], mapped_nodes[old_target_uniprot], edge_dict)

        # saving the mapped db and informing user

        db_name = os.path.split(db)[1]

        print("Saving db to %s " % (args.outdir+"/mapped"+db_name))
        print("SHITCOUNTER %d" % shit_counter )

        new_db.save_db_to_file(args.outdir+"/mapped"+db_name)
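
# NOTE: get_primary is not included in this last snippet. A sketch of what it might
# look like, assuming a dictionary database with the same uniprot / foreign_ids layout
# as the query in the mapping example above; the database path below is a placeholder,
# not the project's real file name.

import sqlite3

DICTIONARY_CURSOR = sqlite3.connect("uniprot_dictionary.db").cursor()  # hypothetical path

def get_primary(accession):
    # look up the primary UniProt accession for a (possibly secondary or foreign) accession
    DICTIONARY_CURSOR.execute(
        """
        SELECT uniprot.accession
        FROM foreign_ids JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
        WHERE foreign_ids.accession = ? AND uniprot.is_primary = 1
        """, (accession,))
    row = DICTIONARY_CURSOR.fetchone()
    # fall back to the original accession if the dictionary has no primary entry for it
    return row[0] if row else accession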