def delete_data(database_name, table_name, primary_key, primary_key_content): """ This function is to delete information from database :param database_name: string The database to connect to :param table_name: string The database table being deleted :param primary_key: string The primary key of the database table :param primary_key_content: string The specific primary key content to be deleted :return: """ conn = connect_database(database_name) c = conn.cursor() sql = ("delete from '{0}' where {1} = '{2}'".format( table_name, primary_key, primary_key_content)) c.execute(sql) conn.commit() conn.close() return 0
def get_mesh_disease_info(database_name, table_name, disease_name, primary_key): """ Query the disease information from MESH :param database_name: string The database to connect to :param table_name: string The database table being queried :param disease_name: string The MESH disease name being queried :param primary_key: string The primary key of database table :return: query_list Information in query_list represent MESH_ID, GENE_LIST, NUM_OF_GENE respectively """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from {0} where {1} = '{2}' ".format( table_name, primary_key, disease_name)) c.execute(sql) query_info = c.fetchall() query_info = query_info[0] query_list = list(query_info) conn.close() return query_list
def get_all_gene_num(database_name, table_name): """ Query the gene number of all GDAs (mesh-gene) :param database_name: string The database to connect to :param table_name: string The database table being queried :return: all_gene_num The number of all the genes """ conn = connect_database(database_name) c = conn.cursor() all_gene_list = list() sql = ("select * from {0}".format(table_name)) c.execute(sql) query_list = c.fetchall() for query_line in query_list: tuple_list = query_line[1] tuple_list = tuple_list.split(',') for gene in tuple_list: if gene not in all_gene_list: all_gene_list.append(gene) all_gene_num = len(all_gene_list) return all_gene_num
def get_item_info(database_name, table_name, item_name): """ Query item-element information from item table :param database_name: string The database to connect to :param table_name: string The database table being queried :param item_name: string The item for query :return: query_list Information in query_list represent ITEM_ID, ELEMENT_LIST, ELEMENT_NUM, respectively. """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from {0} where ITEM_ID = '{1}'".format( table_name, item_name)) c.execute(sql) query_info = c.fetchall() query_info = query_info[0] query_list = list(query_info) conn.close() return query_list
def get_all_ICD10cm_disease_group_num(database_name, table_name): """ Query the MESH disease number of all GDAs (mesh-gene) :param database_name: string The database to connect to :param table_name: string The database table being queried :return: all_ICD10cm_disease_group_num The number of all the ICD10cm disease groups """ conn = connect_database(database_name) c = conn.cursor() all_ICD10cm_disease_group_list = list() sql = ("select * from {0}".format(table_name)) c.execute(sql) query_list = c.fetchall() for query_line in query_list: icd10cm_disease_group = query_line[0] if icd10cm_disease_group not in all_ICD10cm_disease_group_list: all_ICD10cm_disease_group_list.append(icd10cm_disease_group) all_ICD10cm_disease_group_num = len(all_ICD10cm_disease_group_list) return all_ICD10cm_disease_group_num
def get_all_item_num(database_name, table_name): """ Query all element number :param database_name: string The database to connect to :param table_name: string The database table being queried :return: all_item_num The number of all the items """ conn = connect_database(database_name) c = conn.cursor() all_item_list = list() sql = ("select * from [{0}]".format(table_name)) c.execute(sql) query_list = c.fetchall() for query_line in query_list: mesh = query_line[0] if mesh not in all_item_list: all_item_list.append(mesh) all_item_num = len(all_item_list) return all_item_num # print(get_all_item_num('sample','item')) # print(get_all_group_num('sample','group'))
def get_all_group_num(database_name, table_name): """ Query all group number :param database_name: string The database to connect to :param table_name: string The group table name :return: all_group_num The number of all the group """ conn = connect_database(database_name) c = conn.cursor() all_group_list = list() sql = ("select * from [{0}]".format(table_name)) c.execute(sql) query_list = c.fetchall() for query_line in query_list: group = query_line[0] if group not in all_group_list: all_group_list.append(group) all_group_num = len(all_group_list) return all_group_num
def get_all_element_list(database_name, table_name): """ Query the list of all elements :param database_name: string The database to connect to :param table_name: string The item table :return: all_element_list The list of all elements """ conn = connect_database(database_name) c = conn.cursor() all_element_list = list() sql = ("select * from [{0}]".format(table_name)) c.execute(sql) query_list = c.fetchall() for query_line in query_list: tuple_list = query_line[1] tuple_list = tuple_list.split(',') for gene in tuple_list: if gene not in all_element_list: all_element_list.append(gene) return all_element_list
def get_icd_diseasegroup_diseaseinfo(database_name, table_name, primary_key, disease_group_name): """ Query the disease information of a ICD10cm disease group :param database_name: string The database to connect to :param table_name: string The database_table being queried :param primary_key: string The primary key of database table :param disease_group_name: The ICD10cm disease group name being queried :return: query_list Information in query_list represent GROUP_ID,DISEASE_LIST,NUM_OF_DISEASE respectively. """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from {0} where {1} = '{2}' ".format( table_name, primary_key, disease_group_name)) c.execute(sql) query_info = c.fetchall() query_info = query_info[0] query_list = list(query_info) conn.close() return query_list
def mesh_gene_info_txt_into_database(database_name, table_name, txt_file_path): """ Transfer MESH_GENE GDAs data from .txt file to database Every line in GDAs represents a gene-disease association e.g: 50518 MESH:D003920 50518 MESH:D003924 number string is the ID of gene MESH string is the ID of disease,obtained from MeSH(Medical Subject Headings) The degree of a disease is the number of genes associated with that disease, while the degree of a gene is the number of diseases annotated with that gene :param database_name: string The database to connect to :param table_name: string The database table name that store GDAs data :param txt_file_path: string The path of GDAs file :return: """ f = open(txt_file_path, 'r+') linelist = f.readlines() gdas_dt = {} for line in linelist: line = line.strip('\n') line = line.split(' ') if line[0].isdigit() is True: gdas_dt.setdefault('{0}'.format(line[1]), []).append(line[0]) if line[1].isdigit() is True: gdas_dt.setdefault('{0}'.format(line[0]), []).append(line[1]) # 将字典写入json文件中 # json_str = json.dumps(gdas_dt, indent=4) # with open('../data/{0}_dt.json'.format(table_name), 'w') as f: # f.write(json_str) conn = connect_database(database_name) c = conn.cursor() sql = "create table {0} (DISEASE_ID VARCHAR(50) PRIMARY KEY, GENE_LIST VARCHAR(255), NUM_OF_GENE INT(10))".format( table_name) c.execute(sql) for disease in gdas_dt: infolist = gdas_dt[disease] str = ','.join(infolist) # 用","连接元素,构建成新的字符串 sql = "insert into {0}(DISEASE_ID,NUM_OF_GENE,GENE_LIST) values('%s',%d,'%s')".format(table_name) \ % (disease, len(gdas_dt[disease]), str) c.execute(sql) conn.commit() conn.close()
def get_2group_shared_items(database_name, table_name, group1, group2): """ Query the shared items information between group1 and group2 :param database_name: string The database to connect to :param table_name: string The item table being queried :param group_name1: string group_id of group1 :param group_name2: string group_id of group2 :return: query_list Information in query_list represent group1,group2,shared_item_list,shared_item_num respectively. """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from [{0}] where GROUP_ID = '{1}' ".format( table_name, group1)) c.execute(sql) query1_info = c.fetchall() query1_info = query1_info[0] query1_1ist = list(query1_info) sql = ("select * from [{0}] where GROUP_ID = '{1}' ".format( table_name, group2)) c.execute(sql) query2_info = c.fetchall() query2_info = query2_info[0] query2_1ist = list(query2_info) itemlist1 = query1_1ist[1] itemlist1 = itemlist1.split(',') itemset1 = set(itemlist1) itemlist2 = query2_1ist[1] itemlist2 = itemlist2.split(',') itemset2 = set(itemlist2) shared_itemset = itemset1 & itemset2 shared_itemlist = list(shared_itemset) num_of_shared_item = len(shared_itemlist) shared_itemlist = ','.join(shared_itemlist) query_list = list() query_list.append(group1) query_list.append(group2) query_list.append(shared_itemlist) query_list.append(num_of_shared_item) conn.close() return query_list
def get_2item_shared_elements(database_name, table_name, item1, item2): """ Query the shared elements information between item1 and item2 :param database_name: string The database to connect to :param table_name: string The item table being queried :param item1: string item_id of item1 :param item2: string item_id of item2 :return: query_list Information in query_list represent item1,item2,shared_element_list,shared_element_num respectively. """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from [{0}] where ITEM_ID = '{1}' ".format( table_name, item1)) c.execute(sql) query1_info = c.fetchall() query1_info = query1_info[0] query1_1ist = list(query1_info) sql = ("select * from [{0}] where ITEM_ID = '{1}' ".format( table_name, item2)) c.execute(sql) query2_info = c.fetchall() query2_info = query2_info[0] query2_1ist = list(query2_info) elementlist1 = query1_1ist[1] elementlist1 = elementlist1.split(',') elementset1 = set(elementlist1) elementlist2 = query2_1ist[1] elementlist2 = elementlist2.split(',') elementset2 = set(elementlist2) shared_elementset = elementset1 & elementset2 shared_elementlist = list(shared_elementset) num_of_shared_element = len(shared_elementlist) shared_elementlist = ','.join(shared_elementlist) query_list = list() query_list.append(item1) query_list.append(item2) query_list.append(shared_elementlist) query_list.append(num_of_shared_element) conn.close() return query_list
def update_data(database_name, table_name): """ This function is used to update information in database :param database_name: string The database to connect to :param table_name: string The database table being updated :return: """ conn = connect_database(database_name) c = conn.cursor() if table_name is 'item': update_info = input( "please enter update information(ITEM_ID,ELEMENT_LIST,ELEMENT_NUM),split by ' ':" ) update_info = update_info.split(' ') item_id = update_info[0] element_list = update_info[1] element_num = update_info[2] sql = ( "update [{0}] set ELEMENT_LIST ='{1}',ELEMENT_NUM = '{2}' where ITEM_ID = {3}" .format(table_name, element_list, element_num, item_id)) c.execute(sql) conn.commit() elif table_name is 'group': update_info = input( "please enter update information(GROUP_ID,ITEM_LIST,ITEM_NUM,ELEMENT_LIST,ELEMENT_NUM)" ",split by ' ':") update_info = update_info.split(' ') group_id = update_info[0] item_list = update_info[1] item_num = update_info[2] element_list = update_info[3] element_num = update_info[4] sql = ( "update [{0}] set ITEM_LIST ='{1}',ELEMENT_LIST = '{2}',ITEM_NUM = '{3}',ELEMENT_NUM='{4}' " "where GROUP_ID = {5}".format(table_name, item_list, element_list, item_num, element_num, group_id)) c.execute(sql) conn.commit() else: print("no such table exists in '{0}' database".format(database_name)) conn.close() return 0
def insert_data(database_name, table_name): """ This function is used to insert data into database :param database_name: string The database to connect to :param table_name: string The database table being inserted :return: """ conn = connect_database(database_name) c = conn.cursor() if table_name is 'item': insert_info = input( "please enter insert information(ITEM_ID,ELEMENT_LIST,ELEMENT_NUM),split by ' ':" ) insert_info = insert_info.split(' ') item_id = insert_info[0] element_list = insert_info[1] element_num = insert_info[2] sql = ( "insert into {0} (ITEM_ID,ELEMENT_LIST,ELEMENT_NUM) values ({1},{2},{3})" .format(table_name, item_id, element_list, element_num)) c.execute(sql) conn.commit() elif table_name is 'group': insert_info = input( "please enter insert information(GROUP_ID,ITEM_LIST,ITEM_NUM,ELEMENT_LIST,ELEMENT_NUM)" ",split by ' ':") insert_info = insert_info.split(' ') group_id = insert_info[0] item_list = insert_info[1] item_num = insert_info[2] element_list = insert_info[3] element_num = insert_info[4] sql = ( "insert into [{0}] (GROUP_ID,ITEM_LIST,ITEM_NUM,ELEMENT_LIST,ELEMENT_NUM) values ({1},{2},{3},{4},{5})" .format(table_name, group_id, item_list, item_num, element_list, element_num)) c.execute(sql) conn.commit() else: print("no such table exists in '{0}' database".format(database_name)) conn.close() return 0
def group_info_txt_into_database(database_name, table_name, txt_file_path): """ Transfer ICD10CM group information from TXT into database Every line in txt contains a group information. e.g: ICD10CM:A48 ['MESH:D007877', 'MESH:D012772'] ICD10CM:A49 ['MESH:D001424'] ... The first string represents the name of this group. The second string represents diseases contains in this group :param database_name: string The database to connect to :param table_name: string The database table name that store group data :param txt_file_path: string The path of group information file :return: """ conn = connect_database(database_name) c = conn.cursor() sql = "create table {0} (GROUP_ID VARCHAR(50) PRIMARY KEY, DISEASE_LIST VARCHAR(255)," \ "NUM_OF_DISEASE INT(10))".format(table_name) c.execute(sql) f = open(txt_file_path, 'r+') linelist = f.readlines() for line in linelist: list = [] line = line.strip('\n') line = line.split(' ') group_name = line[0] disease_list = line[1] disease_list = disease_list.strip('[]') disease_list = disease_list.split(', ') for disease in disease_list: disease = eval(disease) # 去掉字符串两端的单引号 list.append(disease) num_of_disease = len(list) disease_str = ','.join(list) sql = "insert into {0} (GROUP_ID,DISEASE_LIST,NUM_OF_DISEASE) VALUES ('%s','%s',%d)".format(table_name) \ % (group_name, disease_str, num_of_disease) c.execute(sql) conn.commit() conn.close()
def cal_2indijaccard(database_name, table_name, primary_key_name, individual_name1, individual_name2): """ Calculate similarity between two diseases based on Jaccard Index :param database_name: string The database to connect to :param table_name: string The database table to be queried :param primary_key_name: string The primary key used to query disease information in database :param individual_name1: string A disease for calculating similarity :param individual_name2: string A disease for calculating similarity :return: jaccard_num The jaccard similarity between two disease """ conn = connect_database(database_name) c = conn.cursor() sql1 = ("select * from {0} where {1} = '{2}'".format(table_name, primary_key_name, individual_name1)) sql2 = ("select * from {0} where {1} = '{2}'".format(table_name, primary_key_name, individual_name2)) c.execute(sql1) infolist1 = c.fetchall() c.execute(sql2) infolist2 = c.fetchall() for member in infolist1: tuple_list1 = member[1] # get the causing gene list from the result of querying tuple_list1 = tuple_list1.split(',') set_list1 = set(tuple_list1) # transfer the causing gene list into set, set A set_list1_len = len(set_list1) # the size of the causing gene set A for member2 in infolist2: tuple_list2 = member2[1] tuple_list2 = tuple_list2.split(',') set_list2 = set(tuple_list2) # set B set_list2_len = len(set_list2) # the size of causing gene set B set1_and_set2_list = set_list1 & set_list2 # the intersection of set A and set B # print(set1_and_set2_list) set1_and_set2_list_len = len(set1_and_set2_list) # the size of the intersection of set A and set B jaccard_num = set1_and_set2_list_len / (set_list1_len + set_list2_len - set1_and_set2_list_len) jaccard_num = '%.5f' % jaccard_num conn.close() return jaccard_num
def get_icd_diseasegroup_geneinfo(database_name, table_name, primary_key, disease_group_name): """ Query the gene information of a ICD10cm disease group :param database_name: string The database to connect to :param table_name: string The database table being queried :param primary_key: string The primary key of database table :param disease_group_name: string The ICD10cm disease group being queried :return: query_list Information in query_list represent GROUP_ID,GENE_LIST,NUM_OF_GENE respectively. """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from {0} where {1} = '{2}' ".format( table_name, primary_key, disease_group_name)) c.execute(sql) query_info = c.fetchall() query_info = query_info[0] disease_query_list = list(query_info) # print(disease_query_list) disease_list = disease_query_list[1] disease_list = disease_list.split(',') all_genelist = [] for disease in disease_list: gene_list = get_mesh_disease_info(database_name, 'mesh_gene', '{}'.format(disease), 'DISEASE_ID')[1] gene_list = gene_list.split(',') for gene in gene_list: if gene not in all_genelist: all_genelist.append(gene) all_gene = ','.join(all_genelist) query_list = list() query_list.append(disease_query_list[0]) query_list.append(all_gene) query_list.append(len(all_genelist)) conn.close() return query_list
def cal_2itemlin(database_name, table_name, item1, item2): """ Calculate similarity between two items based on Lin's method :param database_name: string The database to connect to :param table_name: string The database table to be queried :param item1: string An item for calculating similarity :param item2: string An item for calculating similarity :return: lin_num The Lin's method similarity between two items """ conn = connect_database(database_name) c = conn.cursor() sql1 = ("select * from [{0}] where ITEM_ID = '{1}'".format(table_name, item1)) sql2 = ("select * from [{0}] where ITEM_ID = '{1}'".format(table_name, item2)) c.execute(sql1) infolist1 = c.fetchall() c.execute(sql2) infolist2 = c.fetchall() for member in infolist1: tuple_list1 = member[1] # get the causing gene list from the result of querying tuple_list1 = tuple_list1.split(',') set_list1 = set(tuple_list1) # transfer the causing gene list into set, set A set_list1_len = len(set_list1) # the size of the causing gene set A for member2 in infolist2: tuple_list2 = member2[1] tuple_list2 = tuple_list2.split(',') set_list2 = set(tuple_list2) # set B set_list2_len = len(set_list2) # the size of causing gene set B set1_and_set2_list = set_list1 & set_list2 # the intersection of set A and set B # print(set1_and_set2_list) set1_and_set2_list_len = len(set1_and_set2_list) # the size of the intersection of set A and set B # print(set_list1_len, set_list2_len, set1_and_set2_list_len) lin_num = (2 * set1_and_set2_list_len) / (set_list1_len + set_list2_len) lin_num = '%.5f' % lin_num conn.close() return lin_num
def cal_2modulemathurscore(database_name, table_name, primary_key, group_name1, group_name2): """ Calculate similarity between two disease groups based on Mathur's score method :param database_name: string The database to connect to :param table_name: string The database table being queried :param primary_key: string The primary key of database table :param group_name1: string A disease group for calculating similarity :param group_name2: string A disease group for calculating similarity :return: mathur_score The Mathur's score method similarity between two disease group """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from group_mathur where GROUP_ID = '{0}'".format( group_name1)) c.execute(sql) mathur_info1 = c.fetchall() mathur_info1 = mathur_info1[0] max_sim_group1_i = mathur_info1[1] max_sim_group1_i = float(max_sim_group1_i) sql = ("select * from group_mathur where GROUP_ID = '{0}'".format( group_name2)) c.execute(sql) mathur_info2 = c.fetchall() mathur_info2 = mathur_info2[0] max_sim_group2_i = mathur_info2[1] max_sim_group2_i = float(max_sim_group2_i) mathur = cal_2modulemathur(database_name, table_name, primary_key, group_name1, group_name2) mathur_score = mathur / ((max_sim_group1_i + max_sim_group2_i) / 2) mathur_score = '%.5f' % mathur_score conn.close() return mathur_score
def write_item_info_into_database(database_name, table_name, txt_file_path): """ Transfer item-element data from txt file to database. Every line in txt file represents a item-element association. The first and the last field of each line represent item_id, element_id respectively. :param database_name: string The database to connect to :param table_name: string The database table to store item-element data :param txt_file_path: string The path of item-element association txt file. :return: """ f = open(txt_file_path, 'r+') linelist = f.readlines() item_element_dt = {} for line in linelist: line = line.strip('\n') line = line.split(' ') item_element_dt.setdefault('{0}'.format(line[0]), []).append(line[1]) # 将字典写入json文件中 # json_str = json.dumps(item_element_dt, indent=4) # with open('../data/{0}_dt.json'.format(table_name), 'w') as f: # f.write(json_str) # 链接数据库,sql语句创建数据库表item conn = connect_database(database_name) c = conn.cursor() sql = "create table {0} (ITEM_ID VARCHAR(50) PRIMARY KEY, ELEMENT_LIST VARCHAR(255), " \ "ELEMENT_NUM INT(10))".format(table_name) c.execute(sql) for item in item_element_dt: infolist = item_element_dt[item] str = ','.join(infolist) # 用","连接元素,构建成新的字符串 sql = "insert into {0}(ITEM_ID,ELEMENT_NUM,ELEMENT_LIST) values('%s',%d,'%s')".format(table_name) \ % (item, len(item_element_dt[item]), str) c.execute(sql) conn.commit() conn.close()
def write_group_info_into_database(database_name, table_name, txt_file_path): """ Transfer group-item-element data from txt file to database. Every line in txt file represents a item-element association. The first and the last field of each line represent item_id, element_id respectively. :param database_name: string The database to connect to :param table_name: string The database table to store item-element data :param txt_file_path: string The path of item-element association txt file. :return: """ f = open(txt_file_path, 'r+') linelist = f.readlines() group_item_dt = {} for line in linelist: line = line.strip('\n') line = line.split(' ') group_item_dt.setdefault('{0}'.format(line[0]), []).append(line[1]) # 将字典写入json文件中 # json_str = json.dumps(item_element_dt, indent=4) # with open('../data/{0}_dt.json'.format(table_name), 'w') as f: # f.write(json_str) # 链接数据库 创建group表 conn = connect_database(database_name) c = conn.cursor() sql = "create table [{0}] (GROUP_ID VARCHAR(50) PRIMARY KEY, ITEM_LIST VARCHAR(255), " \ "ITEM_NUM INT(10),ELEMENT_LIST VARCHAR(255),ELEMENT_NUM INT(10))".format(table_name) c.execute(sql) for group in group_item_dt: group_element = [] item_list = group_item_dt[group] item_str = ','.join(item_list) for item in item_list: # print(item) item_info = get_item_info(database_name, 'item', item) element_list = item_info[1] element_list = element_list.split(',') # print(element_list) for element in element_list: if element not in group_element: group_element.append(element) element_str = ','.join(group_element) sql = "insert into [{0}] (GROUP_ID,ELEMENT_NUM,ELEMENT_LIST,ITEM_LIST,ITEM_NUM) values('%s',%d,'%s','%s',%d)".format(table_name) \ % (group, len(group_element), element_str, item_str, len(item_list)) c.execute(sql) conn.commit() conn.close() # write_group_info_into_database('sample', 'group', '../data/group.txt')
def get_2disease_shared_gene(database_name, table_name, disease1, disease2, primary_key): """ Query the shared gene information between disease1 and disease2 :param database_name: string The database to connect to :param table_name: string The database table being queried :param disease1: string The ICD10cm disease name being queried :param disease2: string The ICD10cm disease name being queried :param primary_key: string The primary key of database table :return: query_list Information in query_list represent disease1,disease2,shared_gene_list,shared_gene_num respectively. """ conn = connect_database(database_name) c = conn.cursor() sql = ("select * from {0} where {1} = '{2}' ".format( table_name, primary_key, disease1)) c.execute(sql) query1_info = c.fetchall() query1_info = query1_info[0] query1_1ist = list(query1_info) sql = ("select * from {0} where {1} = '{2}' ".format( table_name, primary_key, disease2)) c.execute(sql) query2_info = c.fetchall() query2_info = query2_info[0] query2_1ist = list(query2_info) genelist1 = query1_1ist[1] genelist1 = genelist1.split(',') geneset1 = set(genelist1) genelist2 = query2_1ist[1] genelist2 = genelist2.split(',') geneset2 = set(genelist2) shared_geneset = geneset1 & geneset2 shared_genelist = list(shared_geneset) num_of_shared_gene = len(shared_genelist) shared_genelist = ','.join(shared_genelist) query_list = list() query_list.append(disease1) query_list.append(disease2) query_list.append(shared_genelist) query_list.append(num_of_shared_gene) conn.close() return query_list
def insert_data(database_name, table_name): """ This function is used to insert data into database :param database_name: string The database to connect to :param table_name: string The database table being inserted :return: """ conn = connect_database(database_name) c = conn.cursor() if table_name is 'mesh_gene': update_info = input( "please enter insert information(DISEASE_ID,GENE_LIST,NUM_OF_GENE),split by ' ':" ) update_info = update_info.split(' ') disease_id = update_info[0] gene_list = update_info[1] num_of_gene = update_info[2] sql = ( "insert into {0} (DISEASE_ID,GENE_LIST,NUM_OF_GENE) values ({1},{2},{3})" .format(table_name, disease_id, gene_list, num_of_gene)) c.execute(sql) conn.commit() elif table_name is ('ICD10CMinfo'): update_info = input( "please enter insert information(DISEASE_ID,MESH_LIST,DO_LIST,GENE_LIST),split by ' ':" ) update_info = update_info.split(' ') disease_id = update_info[0] mesh_list = update_info[1] do_list = update_info[2] gene_list = update_info[3] sql = ( "insert into {0} (DISEASE_ID,MESH_LIST,DO_LIST,GENE_LIST) values ({1},{2},{3},{4})" .format(table_name, disease_id, mesh_list, do_list, gene_list)) c.execute(sql) conn.commit() elif table_name is ('ICD10CM_gene'): update_info = input( "please enter insert information(DISEASE_ID,GENE_LIST,NUM_OF_GENE),split by ' ':" ) update_info = update_info.split(' ') disease_id = update_info[0] gene_list = update_info[1] num_of_gene = update_info[2] sql = ( "insert into {0} (DISEASE_ID,GENE_LIST,NUM_OF_GENE) values ({1},{2},{3})" .format(table_name, disease_id, gene_list, num_of_gene)) c.execute(sql) conn.commit() elif table_name is ('group_info'): update_info = input( "please enter insert information(GROUP_ID,DISEASE_LIST,NUM_OF_DISEASE),split by ' ':" ) update_info = update_info.split(' ') group_id = update_info[0] disease_list = update_info[1] num_of_disease = update_info[2] sql = ( "insert into {0} (GROUP_ID,DISEASE_LIST,NUM_OF_DISEASE) values ({1},{2},{3})" .format(table_name, group_id, disease_list, num_of_disease)) c.execute(sql) conn.commit() else: print("no such table exists in '{0}' database".format(database_name)) conn.close() return 0
def update_data(database_name, table_name): """ This function is used to update information in database :param database_name: string The database to connect to :param table_name: string The database table being updated :return: """ conn = connect_database(database_name) c = conn.cursor() if table_name is 'mesh_gene': update_info = input( "please enter update information(DISEASE_ID,GENE_LIST,NUM_OF_GENE),split by ' ':" ) update_info = update_info.split(' ') disease_id = update_info[0] gene_list = update_info[1] num_of_gene = update_info[2] sql = ( "update {0} set GENE_LIST ='{1}',NUM_OF_GENE = '{2}' where DISEASE_ID = {3}" .format(table_name, gene_list, num_of_gene, disease_id)) c.execute(sql) conn.commit() elif table_name is ('ICD10CMinfo'): update_info = input( "please enter update information(DISEASE_ID,MESH_LIST,DO_LIST,GENE_LIST),split by ' ':" ) update_info = update_info.split(' ') disease_id = update_info[0] mesh_list = update_info[1] do_list = update_info[2] gene_list = update_info[3] sql = ( "update {0} set MESH_LIST ='{1}',DO_LIST = '{2}',GENE_LIST = '{3}' " "where DISEASE_ID = {4}".format(table_name, mesh_list, do_list, gene_list, disease_id)) c.execute(sql) conn.commit() elif table_name is ('ICD10CM_gene'): update_info = input( "please enter update information(DISEASE_ID,GENE_LIST,NUM_OF_GENE),split by ' ':" ) update_info = update_info.split(' ') disease_id = update_info[0] gene_list = update_info[1] num_of_gene = update_info[2] sql = ("update {0} set GENE_LIST ='{1}',NUM_OF_GENE = '{2}' " "where DISEASE_ID = {3}".format(table_name, gene_list, num_of_gene, disease_id)) # c.execute(sql) conn.commit() elif table_name is ('group_info'): update_info = input( "please enter update information(GROUP_ID,DISEASE_LIST,NUM_OF_DISEASE),split by ' ':" ) update_info = update_info.split(' ') group_id = update_info[0] disease_list = update_info[1] num_of_disease = update_info[2] sql = ("update {0} set DISEASE_LIST ='{1}',NUM_OF_DISEASE = '{2}' " "where GROUP_ID = {3}".format(table_name, disease_list, num_of_disease, group_id)) c.execute(sql) conn.commit() else: print("no such table exists in '{0}' database".format(database_name)) conn.close() return 0
def icd10cm_gene_info_json_into_database(database_name, table_name, json_file_path): """ Transfer icd10cm and gene data from JSON file into database JSON file organizes data as directory format: e.g: { ... "ICD10CM:E88.9": { "mesh": [ "MESH:D008659" ], "do": [ "DOID:0014667" ], "gene_list": [ "3630", "2110", "5468", "1828", "1401", "196", "2101", "3356", "51733" ] }, ... } Each key in directory is a ICD10CM disease id, and it contains gene_list and numbers_of_gene Each ICD10CM disease id mappings several gene. :param database_name: string The database to connect to :param table_name: The database table name that store GDAs data :param json_file_path: The path of JSON file :return: """ conn = connect_database(database_name) c = conn.cursor() sql = "create table {0} (DISEASE_ID VARCHAR(50) PRIMARY KEY , " \ "GENE_LIST VARCHAR(255), NUM_OF_GENE INT(10))".format(table_name) c.execute(sql) with open(json_file_path) as f: file = json.load(f) for disease in file: gene_info = file[disease]['gene_list'] gene_str = ','.join(gene_info) # 用","连接元素,构建成新的字符串 sql = "insert into {0}(DISEASE_ID, GENE_LIST, NUM_OF_GENE) values ('%s','%s',%d)".format(table_name) \ % (disease, gene_str, len(file[disease]['gene_list'])) c.execute(sql) conn.commit() conn.close()
def write_group_mathur_max_into_database(database_name, table_name, group_mathur_table_name): """ This function is used to write mathur number of disease group into database table :param database_name: string The database to connect to :param table_name: :param group_mathur_table_name: :return: """ conn = connect_database(database_name) c = conn.cursor() sql = "create table {0} (GROUP_ID VARCHAR (50) PRIMARY key , MATHUR_NUM VARCHAR (10))".format( group_mathur_table_name) c.execute(sql) sql = ("select * from {}".format(table_name)) c.execute(sql) query_list = c.fetchall() all_gene_num = get_all_gene_num(database_name, "mesh_gene") for group_info1 in query_list: max_mathur = 0 for group_info2 in query_list: # if group_info1[0] is not group_info2[0]: group_a_name = group_info1[0] group_x_name = group_info2[0] group_a_genelist = get_icd_diseasegroup_geneinfo( database_name, table_name, "GROUP_ID", group_a_name)[1] group_x_genelist = get_icd_diseasegroup_geneinfo( database_name, table_name, "GROUP_ID", group_x_name)[1] group_a_genelistlen = get_icd_diseasegroup_geneinfo( database_name, table_name, "GROUP_ID", group_a_name)[2] group_x_genelistlen = get_icd_diseasegroup_geneinfo( database_name, table_name, "GROUP_ID", group_x_name)[2] group_a_genelist = group_a_genelist.split(',') set_list1 = set(group_a_genelist) group_x_genelist = group_x_genelist.split(',') set_list2 = set(group_x_genelist) set1_and_set2_list = set_list1 & set_list2 # the intersection of set A and set B set1_and_set2_list_len = len( set1_and_set2_list ) # the size of the intersection of set A and set B mathur_num = (set1_and_set2_list_len / (group_a_genelistlen + group_x_genelistlen - set1_and_set2_list_len)) / \ ((group_a_genelistlen / all_gene_num) * (group_x_genelistlen / all_gene_num)) print(mathur_num) if mathur_num > max_mathur: max_mathur = mathur_num print('\t') sql = "insert into {0} (GROUP_ID, MATHUR_NUM) values ('%s', '%s')".format( group_mathur_table_name) % (group_a_name, max_mathur) c.execute(sql) conn.commit() conn.close()
def write_indi_mathur_max_into_database(database_name, table_name, mathur_table_name): """ This function is used to write mathur number of individual disease into database table :param database_name: string The database to connect to :param table_name: string The database table to be queried :param mathur_table_name: string The database table used to store mathur data :return: """ conn = connect_database(database_name) c = conn.cursor() sql = "create table {0} (DISEASE_ID VARCHAR(50) PRIMARY KEY , MATHUR_NUM VARCHAR(10))".format( mathur_table_name) c.execute(sql) sql = ("select * from {}".format(table_name)) c.execute(sql) query_list = c.fetchall() all_gene_num = get_all_gene_num(database_name, table_name) for disease_info1 in query_list: max_mathur = 0 # 代表与disease1相似度最大的疾病,相似度的值 for disease_info2 in query_list: disease_a_name = disease_info1[0] disease_x_name = disease_info2[0] disease_a_genelist = disease_info1[1] disease_x_genelist = disease_info2[1] disease_a_genelistlen = disease_info1[2] disease_x_genelistlen = disease_info2[2] disease_a_genelist = disease_a_genelist.split(',') set_list1 = set(disease_a_genelist) disease_x_genelist = disease_x_genelist.split(',') set_list2 = set(disease_x_genelist) set1_and_set2_list = set_list1 & set_list2 # the intersection of set A and set B # print(set1_and_set2_list) set1_and_set2_list_len = len( set1_and_set2_list ) # the size of the intersection of set A and set B mathur_num = (set1_and_set2_list_len / (disease_a_genelistlen + disease_x_genelistlen - set1_and_set2_list_len)) /\ ((disease_a_genelistlen / all_gene_num) * (disease_x_genelistlen / all_gene_num)) print(mathur_num) if mathur_num > max_mathur: max_mathur = mathur_num print('\t') sql = "insert into {0} (DISEASE_ID, MATHUR_NUM) values ('%s','%s')".format( mathur_table_name) % (disease_a_name, max_mathur) c.execute(sql) conn.commit() conn.close()
def cal_2indimathurscore(database_name, table_name, primary_key_name, individual_name1, individual_name2): """ Calculate similarity between two diseases based on Mathur's score method :param database_name: string The database to connect to :param table_name: string The database table to be queried :param primary_key_name: string The primary key used to query disease information in database :param individual_name1: string A disease for calculating similarity :param individual_name2: string A disease for calculating similarity :return: mathur_score The Mathur's score method similarity between two disease """ conn = connect_database(database_name) c = conn.cursor() if table_name is "mesh_gene": sql = ("select * from mesh_mathur where DISEASE_ID = '{0}'".format(individual_name1)) # print(sql) c.execute(sql) mathur_info1 = c.fetchall() mathur_info1 = mathur_info1[0] max_sim_indi1_i = mathur_info1[1] max_sim_indi1_i = float(max_sim_indi1_i) # print(mathur_info1) # print(max_sim_indi1_i) sql = ("select * from mesh_mathur where DISEASE_ID = '{0}'".format(individual_name2)) # print(sql) c.execute(sql) mathur_info2 = c.fetchall() mathur_info2 = mathur_info2[0] max_sim_indi2_i = mathur_info2[1] max_sim_indi2_i = float(max_sim_indi2_i) # print(mathur_info2) # print(max_sim_indi2_i) elif table_name is "ICD10CM_gene": sql = ("select * from ICD10cm_mathur where DISEASE_ID = '{0}'".format(individual_name1)) # print(sql) c.execute(sql) mathur_info1 = c.fetchall() # print(mathur_info1) mathur_info1 = mathur_info1[0] max_sim_indi1_i = mathur_info1[1] max_sim_indi1_i = float(max_sim_indi1_i) # print(mathur_info1) # print(max_sim_indi1_i) sql = ("select * from ICD10cm_mathur where DISEASE_ID = '{0}'".format(individual_name2)) c.execute(sql) mathur_info2 = c.fetchall() mathur_info2 = mathur_info2[0] max_sim_indi2_i = mathur_info2[1] max_sim_indi2_i = float(max_sim_indi2_i) # print(mathur_info2) # print(max_sim_indi2_i) mathur = cal_2indimathur(database_name, table_name, primary_key_name, individual_name1, individual_name2) mathur_score = mathur / ((max_sim_indi1_i + max_sim_indi2_i) / 2) mathur_score = '%.5f' % mathur_score conn.close() return mathur_score
def cal_2rdgs(database_name, table_name, primary_key, group_name1, group_name2): """ Calculate the similarity between two disease groups based on R-DGS :param database_name: string The database to connect to :param table_name: string The database table being queried :param primary_key: string The primary key of database table :param group_name1: string A disease group for calculating similarity :param group_name2: string A disease group for calculating similarity :return: sim The similarity between two disease groups """ ################################################################ # conect to the database and return the query information ################################################################ conn = connect_database(database_name) c = conn.cursor() sql1 = ("select * from {0} where {1} = '{2}' ".format( table_name, primary_key, group_name1)) sql2 = ("select * from {0} where {1} = '{2}' ".format( table_name, primary_key, group_name2)) c.execute(sql1) infolist1 = c.fetchall() c.execute(sql2) infolist2 = c.fetchall() # print(infolist1) # print(infolist2) ####################################################################### # find the gene number of each disease group(group1_item_num,group2_item_num) ######################################################################## group_1_item_num = get_icd_diseasegroup_geneinfo(database_name, table_name, primary_key, group_name1)[2] group_2_item_num = get_icd_diseasegroup_geneinfo(database_name, table_name, primary_key, group_name2)[2] # print(group_1_item_num) # print(group_2_item_num) # print(get_icd_diseasegroup_geneinfo(database_name, table_name, primary_key, group_name1)[1]) # print(get_icd_diseasegroup_geneinfo(database_name, table_name, primary_key, group_name2)[1]) ############################################################### # find the gene number of all the GDAs ############################################################### all_gene_num = get_all_gene_num(database_name, "mesh_gene") # print(all_gene_num) ############################################################### # bulid the random model of GROUP_NAME1, GROUP_NAME2, calculate C_random ############################################################### c_random = (group_1_item_num * group_2_item_num) / all_gene_num # print(c_random) ############################################################### # calculate the gene number of (GROUP_NAME1 intersection GROUP_NAME2), calculate C_real ############################################################### c_real = get_2diseasegroup_shared_gene(database_name, table_name, group_name1, group_name2, primary_key)[3] # print(c_real) ############################################################### # calculate sij = c_real/c_random ############################################################### s = float(c_real) / float(c_random) ############################################################### # normalization Si,j by min-max normalization method ############################################################### min_score = 0 max_score = float(all_gene_num) / min(float(group_1_item_num), float(group_2_item_num)) # print(max_score) sim = (s - min_score) / (max_score - min_score) sim = '%.5f' % sim conn.close() return sim
def cal_2indicosine(database_name, table_name, primary_key_name, individual_name1, individual_name2): """ Calculate similarity between two diseases based on Cosine method :param database_name: string The database to connect to :param table_name: string The database table to be queried :param primary_key_name: string The primary key used to query disease information in database :param individual_name1: string A disease for calculating similarity :param individual_name2: string A disease for calculating similarity :return: cosθ The Cosine similarity between two disease """ conn = connect_database(database_name) c = conn.cursor() sql1 = ("select * from {0} where {1} = '{2}'".format(table_name, primary_key_name, individual_name1)) sql2 = ("select * from {0} where {1} = '{2}'".format(table_name, primary_key_name, individual_name2)) c.execute(sql1) infolist1 = c.fetchall() c.execute(sql2) infolist2 = c.fetchall() ######################################################## # 原来的部分,只看其出现没出现,设置0/1,不关心其出现的次数 ######################################################## # for member1 in infolist1: # tuple_list1 = member1[1] # tuple_list1 = tuple_list1.split(',') # set_list1 = set(tuple_list1) # 集合A # # print(len(set_list1)) # for member2 in infolist2: # tuple_list2 = member2[1] # tuple_list2 = tuple_list2.split(',') # set_list2 = set(tuple_list2) # 集合B # # print(len(set_list2)) # set1_union_set2_list = set_list1 | set_list2 for member1 in infolist1: vector_list1 = member1[1] vector_list1 = vector_list1.split(',') set_list1 = set(vector_list1) for member2 in infolist2: vector_list2 = member2[1] vector_list2 = vector_list2.split(',') set_list2 = set(vector_list2) set1_union_set2_list = set_list1 | set_list2 vector1 = [] vector2 = [] for item in set1_union_set2_list: num = 0 for key in vector_list1: if key == item: num = num + 1 vector1.append(num) for item in set1_union_set2_list: num = 0 for key in vector_list2: if key == item: num = num + 1 vector2.append(num) # print(set1_union_set2_list) # print(len(set1_union_set2_list)) # samelist = [] # for key1 in set_list1: # for key2 in set_list2: # if key1 == key2: # samelist.append(key1) # print(samelist) ######################################################## # 原来的部分,只看其出现没出现,设置0/1,不关心其出现的次数 ######################################################## # vector1 = [] # vector2 = [] # # for item in set1_union_set2_list: # if item in set_list1: # vector1.append(1) # elif item not in set_list1: # vector1.append(0) # if item in set_list2: # vector2.append(1) # elif item not in set_list2: # vector2.append(0) # print(vector1) # print(vector2) vectorlen = len(set1_union_set2_list) a = b = c = 0 for i in range(vectorlen): a = a + vector1[i] * vector2[i] b = b + vector1[i] * vector1[i] c = c + vector2[i] * vector2[i] cosθ = a / ((b ** 0.5) * (c ** 0.5)) cosθ = '%.5f' % cosθ conn.close() return cosθ