help="The location of the PsimiSQL class") parser.add_argument('--map-file', required=True, metavar="MapFile", type=str, help="The location of the file that maps salmonella proteins to uniprot") parser.add_argument('--dictionary-db', required=True, metavar="Dictionary-db", type=str) parser.add_argument('--prediction-name', required=True, metavar="PredName", type=str) args = parser.parse_args() dictionary_api = sqlite3.connect(args.dictionary_db) dictionary_cursor = dictionary_api.cursor() sys.path.append(args.psimisql) from sqlite_db_api import PsimiSQL # initiating the memory db db_api = PsimiSQL() # the unique nodes will be held here salmonella_nodes= {} human_nodes = {} edges = [] # parsing the map file with open(args.map_file) as mapFile: # skipping header mapFile.readline() for line in mapFile: uniprot, salmonella = line.strip().split("\t")
def main():
    # opening the old_db for mapping
    old_db = PsimiSQL()
    old_db.import_from_db_file(SOURCE_DB_LOCATION)

    # making the script more verbose
    counter = 0
    old_db.cursor.execute("SELECT count(*) FROM node")
    number_of_nodes = old_db.cursor.fetchone()[0]

    # iterating through the old_db's nodes
    old_db.cursor.execute("SELECT * FROM node")

    # mapping old node ids to the new node ids
    old_node_ids_dict = {}

    # initiating an empty db that the mapped nodes are inserted into
    new_db = PsimiSQL()

    # counters for the nodes that do not match
    no_match_counter = 0
    invalid_node_counter = 0

    # looping through the old_db's nodes
    while True:
        row = old_db.cursor.fetchone()

        # informing the user
        sys.stdout.write("Querying node %d out of %d from the dictionary\r" % (counter, number_of_nodes))
        counter += 1

        # until the last row
        if row is None:
            break

        row_id, mitab_name, alt_accession, mitab_tax_id, pathways, aliases, topology = row
        tax_id = str(mitab_tax_id.split(':')[1])
        name = str(mitab_name.split(':')[1])

        old_node_dict = {
            "id": row_id,
            "name": mitab_name,
            "alt_accession": alt_accession,
            "tax_id": mitab_tax_id,
            "pathways": pathways,
            "aliases": aliases,
            "topology": topology
        }

        # if the fetched node is already mapped, only its copy would need to be inserted
        # if "uniprot" in mitab_name:
        #     add_uniprot(old_node_dict, old_node_ids_dict, new_db)
        # else:

        query = """
            SELECT DISTINCT foreign_ids.accession, uniprot.accession, uniprot.is_swissprot, uniprot.is_primary
            FROM foreign_ids
            JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
            WHERE foreign_ids.accession = ? AND uniprot.tax_id = ? AND uniprot.is_primary = 1
        """
        tup = (name, tax_id)
        DICTIONARY_DB_CURSOR.execute(query, tup)
        DICTIONARY_DB.commit()
        result = DICTIONARY_DB_CURSOR.fetchall()

        if len(result) == 0:
            # there is no match in the map for the current node
            no_match_counter += 1
        else:
            # getting a list with only the SwissProt entries from the result of the SQL query
            swiss_nodes = get_swiss_arr(result)
            # getting the TrEMBL entries
            trembl_nodes = get_trembl_arr(result)
            # getting the new aliases
            aliases = get_aliases_string(trembl_nodes)

            # best case scenario: it is a 1 -> 1 mapping
            if len(swiss_nodes) == 1:
                swiss_accession = "uniprot:" + swiss_nodes[0][1]
                add_node(old_node_dict, old_node_ids_dict, swiss_accession, new_db, aliases)
            # if it maps to more than one SwissProt accession, all SwissProt nodes will be added
            elif len(swiss_nodes) > 1:
                for node in swiss_nodes:
                    swiss_accession = "uniprot:" + node[1]
                    add_node(old_node_dict, old_node_ids_dict, swiss_accession, new_db, aliases)
            # adding TrEMBL nodes if the old node does not match any SwissProt accession
            else:
                for node in trembl_nodes:
                    trembl_accession = "trembl:" + node[1]
                    add_node(old_node_dict, old_node_ids_dict, trembl_accession, new_db, aliases)

    print("Inserting nodes to %s done" % SOURCE_DB_TYPE)

    # setting up counters to give the user some information about the ongoing process
    old_db.cursor.execute("SELECT count(*) FROM edge")
    number_of_edges = old_db.cursor.fetchone()[0]
    edge_counter = 0

    query = "SELECT * FROM edge"
    old_db.cursor.execute(query)

    while True:
        # informing the user
        sys.stdout.write("Parsing edge # %d out of %d\r" % (edge_counter, number_of_edges))
        row = old_db.cursor.fetchone()

        if row is None:
            break

        edge_counter += 1

        # deconstructing the row (list)
        (edge_id, old_interactor_a_node_id, old_interactor_b_node_id, interactor_a_node_name,
         interactor_b_node_name, interaction_detection_method, first_author, publication_ids,
         interaction_types, source_db, interaction_identifiers, confidence_scores, layer) = row
        # since we get the old interactor ids from this query, we can simply look up their new id(s) in the old_node_ids dict
        # if both nodes mapped, we add them as an edge to the new db
        if old_interactor_a_node_id in old_node_ids_dict and old_interactor_b_node_id in old_node_ids_dict:
            # looping through every new 'A' node
            for new_node_id_a in old_node_ids_dict[old_interactor_a_node_id]:
                new_node_a_dict = new_db.get_node_by_id(new_node_id_a)
                # looping through every new 'B' node for every new 'A' node and inserting them as an edge
                for new_node_id_b in old_node_ids_dict[old_interactor_b_node_id]:
                    new_node_b_dict = new_db.get_node_by_id(new_node_id_b)

                    # generating the new edge dict
                    new_edge_dict = {
                        'interactor_a_node_id': new_node_id_a,
                        'interactor_b_node_id': new_node_id_b,
                        'interactor_a_node_name': interactor_a_node_name,
                        'interactor_b_node_name': interactor_b_node_name,
                        'interaction_detection_method': interaction_detection_method,
                        'first_author': first_author,
                        'publication_ids': publication_ids,
                        'source_db': "source database:" + SOURCE_DB_TYPE,
                        'interaction_types': interaction_types,
                        'interaction_identifiers': interaction_identifiers,
                        'confidence_scores': confidence_scores,
                        'layer': layer
                    }

                    # inserting the new edge
                    new_db.insert_edge(new_node_a_dict, new_node_b_dict, new_edge_dict)
        else:
            # counting the edges that cannot be inserted into the new db because one of their nodes did not map
            invalid_node_counter += 1

    print("Inserting edges to %s.db finished!" % SOURCE_DB_TYPE)
    new_db.save_db_to_file(DESTINATION_DB_LOCATION)
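# The helper functions used above (get_swiss_arr, get_trembl_arr, get_aliases_string,
# add_node) are not defined in this excerpt. The sketches below are assumptions, not the
# original implementations: they assume the dictionary query returns
# (foreign_accession, uniprot_accession, is_swissprot, is_primary) tuples and that
# PsimiSQL.insert_node() returns the new row id, as it does in the mapping script further below.

def get_swiss_arr(result):
    # keep only the rows flagged as SwissProt (third column of the dictionary query)
    return [row for row in result if row[2] == 1]


def get_trembl_arr(result):
    # keep only the rows that are not SwissProt, i.e. TrEMBL entries
    return [row for row in result if row[2] != 1]


def get_aliases_string(trembl_nodes):
    # join the TrEMBL accessions into a single pipe-separated alias string
    return "|".join("trembl:" + row[1] for row in trembl_nodes)


def add_node(old_node_dict, old_node_ids_dict, new_accession, new_db, aliases):
    # build the new node from the old one, insert it, and remember old id -> [new ids]
    new_node_dict = {
        "name": new_accession,
        "alt_accession": old_node_dict["alt_accession"],
        "tax_id": old_node_dict["tax_id"],
        "pathways": old_node_dict["pathways"],
        "aliases": aliases,
        "topology": old_node_dict["topology"],
    }
    new_id = new_db.insert_node(new_node_dict)
    old_node_ids_dict.setdefault(old_node_dict["id"], []).append(new_id)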
def main():
    # declaring the dicts that will hold the data
    nodes = {}
    collected_edges = {}
    merged_edge_counter = 0
    not_merged_edge = 0

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # filling up the nodes dictionary with the data contained in the db piece files
    for db_file in SOURCE_DB_FILE_LIST:
        # executing a query that selects everything (but the node id) from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute("SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'")

        # iterating through the db row by row
        while True:
            row = cursor.fetchone()

            # until the last row
            if row is None:
                break

            # if unique, inserting the node (row) into the nodes dictionary
            node_id, name, alt_accession, tax_id, pathways, aliases, topology = row
            node = {
                "name": name,
                "alt_accession": alt_accession,
                "tax_id": tax_id,
                "pathways": pathways,
                "aliases": aliases,
                "topology": topology,
            }

            if name not in nodes:
                nodes[name] = node
            else:
                nodes[name] = get_union_of_nodes(nodes[name], node)

        # closing the current db
        db.close()

        # logging some progress info
        current_file = SOURCE_DB_FILE_LIST.index(db_file)
        sys.stdout.write("Building the node dictionary: Processing file %d out of %d\r" % (current_file, sum_files))

    # making a memory database and inserting the unique nodes from the nodes dictionary
    print("Inserting nodes to database")
    parser = PsimiSQL()
    for node in nodes:
        parser.insert_unique_node(nodes[node])
        nodes[node]["id"] = parser.cursor.lastrowid

    # looping through the files again to make an edge list
    print("Started building edge dict")
    file_counter = 1
    for db_file in SOURCE_DB_FILE_LIST:
        sys.stdout.write("Inserting edges to edge dict from '%s' (%d/%d)\r" % (db_file, file_counter, sum_files))

        # executing a query that selects everything (but the node id) from the current SQLite .db file
        db = sqlite3.connect(db_file)
        cursor = db.cursor()
        cursor.execute("SELECT * FROM edge")

        while True:
            row = cursor.fetchone()

            # if there aren't any more rows, break out of the loop
            if not row:
                break

            # deconstructing the row (list)
            (edge_row_id, old_interactor_a_node_id, old_interactor_b_node_id, interactor_a_node_name,
             interactor_b_node_name, interaction_detection_method, first_author, publication_ids,
             interaction_types, source_db, interaction_identifiers, confidence_scores, layer) = row

            # because the node dict building query only asks for human and Salmonella nodes,
            # we have to make sure that we only insert edges whose nodes are both in the nodes dict
            # (i.e. neither node belongs to another organism)
            if interactor_a_node_name in nodes and interactor_b_node_name in nodes:
                # generating an edge id that will be the key in the edge dict
                edge_id = interactor_a_node_name + "@" + interactor_b_node_name

                # generating an edge dict that will be the value for this key in the collected_edges dict
                current_edge = {
                    "interaction_detection_method": interaction_detection_method,
                    "first_author": first_author,
                    "publication_ids": publication_ids,
                    "interaction_types": interaction_types,
                    "source_db": source_db,
                    "interaction_identifiers": interaction_identifiers,
                    "confidence_scores": confidence_scores,
                    "layer": layer,
                }
                # if collected_edges does not yet contain this edge_id, the edge is stored as-is
                if edge_id not in collected_edges:
                    collected_edges[edge_id] = current_edge
                else:
                    # if collected_edges already has this id, the stored edge is merged with the current edge
                    collected_edge = collected_edges[edge_id]
                    collected_edge["interaction_types"] = merge_strings(collected_edge["interaction_types"], current_edge["interaction_types"])
                    collected_edge["first_author"] = merge_strings(collected_edge["first_author"], current_edge["first_author"])
                    collected_edge["source_db"] = merge_strings(collected_edge["source_db"], current_edge["source_db"])
                    collected_edge["interaction_identifiers"] = merge_strings(collected_edge["interaction_identifiers"], current_edge["interaction_identifiers"])
                    collected_edge["interaction_detection_method"] = merge_strings(collected_edge["interaction_detection_method"], current_edge["interaction_detection_method"])
                    collected_edge["confidence_scores"] = merge_strings(collected_edge["confidence_scores"], current_edge["confidence_scores"])

        # keeping the per-file progress counter up to date
        file_counter += 1

    print("Building edge dict done!")
    print("Started inserting edges to the db")

    # iterating through the edges dictionary and inserting the edges into the SQLite db
    for collected_edge_id, edge_to_insert in collected_edges.items():
        # getting the nodes
        node_a, node_b = collected_edge_id.split("@")
        node_a_dict = nodes[node_a]
        node_b_dict = nodes[node_b]
        parser.insert_edge(node_a_dict, node_b_dict, edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)
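# merge_strings() and get_union_of_nodes() are used above but not shown in this excerpt.
# Minimal sketches follow, assuming pipe-separated MITAB-style fields with '-' as the
# empty placeholder; these are assumptions, not the original implementations.

def merge_strings(string_a, string_b):
    # merge two pipe-separated fields, dropping duplicates and empty/'-' placeholders
    items = set()
    for value in (string_a, string_b):
        if value and value != "-":
            items.update(value.split("|"))
    return "|".join(sorted(items)) if items else "-"


def get_union_of_nodes(node_a, node_b):
    # merge two node dicts that describe the same protein
    return {
        "name": node_a["name"],
        "alt_accession": node_a["alt_accession"],
        "tax_id": node_a["tax_id"],
        "pathways": merge_strings(node_a["pathways"], node_b["pathways"]),
        "aliases": merge_strings(node_a["aliases"], node_b["aliases"]),
        "topology": merge_strings(node_a["topology"], node_b["topology"]),
    }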
def main():
    # parsing the arguments
    arguments = parse_arguments()

    # importing the PsimiSQL class
    sys.path.append(arguments.sqlite_db_api)
    from sqlite_db_api import PsimiSQL

    # the nodes and the edges will be stored in dicts
    nodes = {}
    edges = {}

    # Parsing the nodes first, merging pathways, and storing them in the nodes dict created above
    # (querying a node to check whether it has the same pathway and then updating it with SQL queries would be slow)
    with open(arguments.source_file) as source_file:
        # skipping the header line if needed
        if arguments.skip_header:
            source_file.readline()

        # setting up variables for informing the user
        num_lines = float(sum([1 for line in source_file]))
        line_counter = float(0)
        source_file.seek(0)
        # the header has to be skipped again after rewinding the file
        if arguments.skip_header:
            source_file.readline()

        # looping through the file
        for line in source_file:
            # informing the user
            line_counter += 1
            if line_counter % 50 == 0:
                done = (line_counter / num_lines) * 100
                sys.stdout.write("Parsing mitab file (%d%%)\r" % done)

            # deconstructing the line
            (source_acc, target_acc, source_alt_acc, target_alt_acc, source_alias, target_alias,
             int_det_method, author, pubmed_ids, source_tax_id, target_tax_id, int_type,
             source_db, confidence, pathway_ids, layer, source_topology, target_topology) = line.strip().split("\t")

            source_dict = {
                "name": source_acc,
                "alt_accession": source_alt_acc,
                "tax_id": source_tax_id,
                "pathways": pathway_ids,
                "aliases": source_alias,
                "topology": source_topology
            }
            add_to_nodes(source_dict, nodes)

            target_dict = {
                "name": target_acc,
                "alt_accession": target_alt_acc,
                "tax_id": target_tax_id,
                "pathways": pathway_ids,
                "aliases": target_alias,
                "topology": target_topology
            }
            add_to_nodes(target_dict, nodes)

            # adding the edge to the edges dict
            edges["%s@%s" % (source_acc, target_acc)] = {
                'interaction_detection_method': int_det_method,
                'first_author': author,
                'publication_ids': pubmed_ids,
                'interaction_types': int_type,
                'source_db': source_db,
                'interaction_identifiers': '-',
                'confidence_scores': confidence,
                'layer': layer
            }

    # informing the user
    print("Parsing MiTAB file: Finished")

    # now that we have the unique nodes, we can add them to the PSI-MI SQLite database
    # initiating the in-memory MITAB database
    db_api = PsimiSQL()

    num_nodes = float(len(nodes))
    line_counter = float(1)

    # inserting the nodes into the memory db
    for node_name, node_dict in nodes.items():
        # informing the user
        if line_counter % 50 == 0:
            done = float((line_counter / num_nodes) * 100)
            sys.stdout.write("Inserting nodes to NetMiTabSQL (%s%%)\r" % done)
        line_counter += 1

        # inserting the node into the db file
        db_api.insert_node(node_dict)
        # updating (mutating) the node dict with the SQL row id so it can be used later
        # (again, it is faster to store the row ids than to query each rowid)
        node_dict["id"] = db_api.last_row_id

    print("Inserting nodes to NetMiTabSQL: Done")

    num_edges = float(len(edges))
    line_counter = float(1)

    # inserting the edges into the memory db
    for edge_id, edge_dict in edges.items():
        # informing the user
        if line_counter % 50 == 0:
            done = float((line_counter / num_edges) * 100)
            sys.stdout.write("Inserting edges to NetMiTabSQL (%s%%)\r" % done)
        line_counter += 1

        source_name, target_name = edge_id.split('@')
        source_dict = nodes[source_name]
        target_dict = nodes[target_name]
        db_api.insert_edge(source_dict, target_dict, edge_dict)

    print("Inserting edges to NetMiTabSQL: Done")
    print("Saving the database to the filesystem")

    # the database is finished, saving
    db_api.save_db_to_file(arguments.output_file)
    print("Database saved")
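# add_to_nodes() is called for both interactors above but is not defined in this excerpt.
# A minimal sketch, assuming nodes are deduplicated by name and that the pipe-separated
# pathway fields of duplicates are merged; this is an assumption, not the original code.

def add_to_nodes(node_dict, nodes):
    # insert the node if it is new, otherwise merge its pathways into the stored node
    name = node_dict["name"]
    if name not in nodes:
        nodes[name] = node_dict
    else:
        merged = set(nodes[name]["pathways"].split("|")) | set(node_dict["pathways"].split("|"))
        nodes[name]["pathways"] = "|".join(sorted(merged))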
            }

            if name not in nodes:
                nodes[name] = node
            else:
                nodes[name].update(node)

        # closing the current db
        db.close()

        # logging some progress info
        current_file = PIECE_LIST.index(filename)
        sys.stdout.write("Building the node dictionary: Processing file %d out of %d\r" % (current_file, sum_files))

    # making a memory database and inserting the unique nodes from the nodes dictionary
    print('Inserting nodes to database')
    parser = PsimiSQL(SQL_SEED)
    for node in nodes:
        parser.insert_unique_node(nodes[node])

    # now that we have the nodes in the final db, the edges can be inserted
    # there is no need for an edges dictionary, because reading the edges from the files costs less memory
    # iterating through the .db piece files again
    print('Inserting edges to database')
    for filename in PIECE_LIST:
        db = sqlite3.connect(filename)
        query = "SELECT * FROM edge"
        cursor = db.cursor()
        cursor.execute(query)

        # iterating through the current piece .db file
        while True:
help="The name and optionally location where the data should be saved.", ) parser.add_argument( "--psimisql", required=True, metavar="PsimiSQLLocation", type=str, help="The location of the PsimiSQL class" ) args = parser.parse_args() # imports import sys sys.path.append(args.psimisql) from sqlite_db_api import PsimiSQL # initiating the memory db db_api = PsimiSQL() # the unique nodes will be held here nodes = {} edges = {} # parsing the file with open(args.source_file) as source_file: # read the first line source_file.readline() for line in source_file:
help="The location of the source file for nodes") parser.add_argument('--outfile', required=True, metavar="OutputFile", type=str, help="The name and optionally location where the data should be saved.") parser.add_argument('--psimisql', required=True, metavar="PsimiSQLLocation", type=str, help="The location of the PsimiSQL class") args = parser.parse_args() # imports import sys sys.path.append(args.psimisql) from sqlite_db_api import PsimiSQL # initiating the memory db db_api = PsimiSQL() # the unique nodes will be held here nodes = {} edges = {} # parsing the file with open(args.source_file) as source_file: # read the first line source_file.readline() for line in source_file:
metavar="Dictionary-db", type=str) parser.add_argument('--prediction-name', required=True, metavar="PredName", type=str) args = parser.parse_args() dictionary_api = sqlite3.connect(args.dictionary_db) dictionary_cursor = dictionary_api.cursor() sys.path.append(args.psimisql) from sqlite_db_api import PsimiSQL # initiating the memory db db_api = PsimiSQL() # the unique nodes will be held here salmonella_nodes = {} human_nodes = {} edges = [] # parsing the map file with open(args.map_file) as mapFile: # skipping header mapFile.readline() for line in mapFile: uniprot, salmonella = line.strip().split("\t")
help="The name and optionally location where the data should be saved.") parser.add_argument('--db-name', required=True, metavar="DatabaseName", type=str, help="The name of the source database") parser.add_argument('--psimisql', required=True, metavar="PsimiSQLLocation", type=str, help="The location of the PsimiSQL class") args = parser.parse_args() # imports import sys sys.path.append(args.psimisql) from sqlite_db_api import PsimiSQL # initiating the memory db db_api = PsimiSQL() # parsing nodes # nodes will be stored temporarily in memory, because re-querying them is slower nodes = {} # looping through the node file with open(args.node_source_file) as node_file: # informing the user print "Parsing nodes" sum_nodes = sum([1 for x in node_file]) progress = 1 node_file.seek(0)
help="The location of the csv that was saved from arn") parser.add_argument('--outfile', required=True, metavar="OutputFile", type=str, help="The name and optionally location where the data should be saved.") parser.add_argument('--psimisql', required=True, metavar="PsimiSQLLocation", type=str, help="The location of the PsimiSQL class") args = parser.parse_args() # imports import sys sys.path.append(args.psimisql) from sqlite_db_api import PsimiSQL # initiating the memory db db_api = PsimiSQL() # extracting nodes nodes = {} edges = {} print("Parsing file") with open(args.source_file) as arn_file: # skipping the first line arn_file.readline() for line in arn_file: # splitting up the line
help="The name and optionally location where the data should be saved.", ) parser.add_argument( "--psimisql", required=True, metavar="PsimiSQLLocation", type=str, help="The location of the PsimiSQL class" ) args = parser.parse_args() # imports import sys sys.path.append(args.psimisql) from sqlite_db_api import PsimiSQL # initiating the memory db db_api = PsimiSQL() # the unique nodes will be held here nodes = {} # parsing the file with open(args.source_file) as source_file: # skipping the header line source_file.readline() for line in source_file: linearr = line.strip().split("\t")
def main():
    for db in args.source_files:
        # informing the user
        print("Parsing %s" % db)

        cursor = sqlite3.connect(db).cursor()
        mapped_nodes = {}
        nodemap = {}

        cursor.execute("SELECT * FROM node")
        result = cursor.fetchall()
        length = len(result)
        current = 1

        new_db = PsimiSQL()

        cursor.execute("SELECT count(*) FROM node")
        num_of_nodes = cursor.fetchone()[0]

        # mapping nodes
        print("Mapping nodes")
        for line in result:
            # informing the user
            if current % 50 == 0:
                print("Mapping nodes %d/%d" % (current, length))
            current += 1

            row_id, name, alt_accession, tax_id, pathways, aliases, topology = line
            old_uniprot = name
            new_uniprot = "uniprot:" + get_primary(old_uniprot.split(':')[1])

            # storing the new UniProt id for every old id
            nodemap[old_uniprot] = new_uniprot

            mapped_node = {
                'name': new_uniprot,
                'alt_accession': alt_accession,
                'tax_id': tax_id,
                'pathways': pathways,
                'aliases': aliases,
                'topology': topology
            }
            mapped_node['id'] = new_db.insert_node(mapped_node)
            mapped_nodes[new_uniprot] = mapped_node

        if len(nodemap) != num_of_nodes:
            print("Warning: the number of mapped nodes does not match the node count in %s" % db)

        # mapping edges
        cursor.execute("SELECT * FROM edge")
        result = cursor.fetchall()
        print("Mapping edges")
        length = len(result)
        current = 1
        skipped_edge_counter = 0

        for row in result:
            if current % 10 == 0:
                print("Parsing edge %d/%d" % (current, length))
            current += 1

            old_source_uniprot = row[3]
            old_target_uniprot = row[4]

            edge_dict = {
                'interaction_detection_method': row[5],
                'first_author': row[6],
                'publication_ids': row[7],
                'interaction_types': row[8],
                'source_db': row[9],
                'interaction_identifiers': row[10],
                'confidence_scores': row[11],
                'layer': "0"
            }

            if old_source_uniprot not in mapped_nodes or old_target_uniprot not in mapped_nodes:
                # one of the interactors could not be mapped, so the edge is skipped
                skipped_edge_counter += 1
            else:
                new_db.insert_edge(mapped_nodes[old_source_uniprot], mapped_nodes[old_target_uniprot], edge_dict)

        # saving the mapped db and informing the user
        db_name = os.path.split(db)[1]
        print("Saving db to %s" % (args.outdir + "/mapped" + db_name))
        print("Skipped edges (unmapped interactors): %d" % skipped_edge_counter)
        new_db.save_db_to_file(args.outdir + "/mapped" + db_name)
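# get_primary() is used above to map an old UniProt accession to its current primary
# accession, but its definition is not part of this excerpt. A minimal sketch, assuming
# the same dictionary SQLite layout (foreign_ids / uniprot tables with an is_primary flag)
# that the node-mapping script queries earlier in this section; the DICTIONARY_DB_CURSOR
# global is an assumption here, not something defined in this excerpt.

def get_primary(accession):
    # look up the primary UniProt accession; fall back to the input if there is no match
    DICTIONARY_DB_CURSOR.execute(
        """
        SELECT uniprot.accession
        FROM foreign_ids
        JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
        WHERE foreign_ids.accession = ? AND uniprot.is_primary = 1
        """,
        (accession,)
    )
    result = DICTIONARY_DB_CURSOR.fetchone()
    return result[0] if result else accession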