def store_kplets_pile(kplets_pile, cdd2id, file2id):
    """Bulk-insert sorted 3-plets into bacteria_3plets and link each to its source file."""
    _sql_kplet = """insert ignore into bacteria_3plets (kplet_1, kplet_2, kplet_3) values \n"""
    _sql_kplet_file = """insert ignore into bacteria_3plets_win10 (kplet_id, file_id) values \n"""

    for (kplets, fname) in kplets_pile:
        for kplet in kplets:
            # Sort the profile codes and map them to numeric ids.
            kplet = list(kplet)
            kplet.sort()
            kplet = tuple([int(cdd2id[k]) for k in kplet])

            _sql_kplet += """(%d, %d, %d),\n""" % kplet
            _sql_kplet_file += ("""((select id from bacteria_3plets where """ +
                                """kplet_1=%d and kplet_2=%d and kplet_3=%d),""" +
                                """%d),\n""") % (kplet + (int(file2id[fname]),))

    # Replace the trailing ",\n" with a terminating semicolon.
    _sql_kplet = _sql_kplet[:-2] + ';'
    _sql_kplet_file = _sql_kplet_file[:-2] + ';'

    _db = DbClass()
    _db.cmd = _sql_kplet
    _db.execute()
    _db.commit()
    _db.cmd = _sql_kplet_file
    _db.execute()
    _db.commit()
def get_report_kplets(id2cdd, limit_to=500):
    """Return crispr 5-plets found in more than one WGS file, most frequent first."""
    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=15000000"""
    _db.execute()
    _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files
                 from crispr_5plets cp
                 inner join crispr_5plets_wgs cpw on cp.id = cpw.kplet_id
                 inner join crispr_wgs_files cwf on cpw.file_id = cwf.id
                 group by cp.id
                 having count(distinct cwf.id)>1
                 order by cnt desc
                 limit 0, %d""" % limit_to
    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:6]]
        # Skip kplets with repeated profiles.
        if len(set(kplet_codes)) != 5:
            continue
        count = row[6]
        files = row[7].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)
    return out_list
def get_report_kplets(id2cdd, limit_to=500, load_locations=None):
    """Return bacterial 3-plets seen in more than one genome, ranked by summed genome weight."""
    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()
    _db.cmd = """select ap.*, count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 group by ap.id
                 having count(distinct s.genome_id)>1
                 order by wgt desc
                 limit 0, %d""" % limit_to
    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:4]]
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        weight = row[5]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)
    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]
    return out_list
def get_report_kplet(kplet_id, id2cdd, load_locations=None):
    """Return a single bacterial 5-plet report object looked up by its id."""
    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000;"""
    _db.execute()
    _db.cmd = """select ap.*, count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_5plets ap
                 inner join bacteria_5plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 where ap.id=%d
                 group by ap.id""" % kplet_id
    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:6]]
        if len(set(kplet_codes)) != 5:
            continue
        count = row[6]
        files = row[8].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)
    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]
    return out_list[0]
def insert_source_duplets(kplets_pile, profile2id, source_id):
    """Bulk-insert adjacent duplets and link each to its source genome."""
    sql_insert_kplet = """insert ignore into prok1402_all_adj_duplet (kplet_1, kplet_2, multidomain) values \n"""
    sql_insert_kplet_file = """insert into prok1402_all_adj_duplet_source (kplet_id, source_id) values \n"""

    for (duplet, fname, multidomain) in kplets_pile:
        terms = sorted(list(duplet))
        terms = [profile2id[_] for _ in terms]

        sql_insert_kplet += """(%s, %s, %d),\n""" % (terms[0], terms[1], multidomain)
        sql_insert_kplet_file += """((select id from prok1402_all_adj_duplet where """ + \
                                 """kplet_1=%s and kplet_2=%s and multidomain=%d), %d),\n""" % \
                                 (terms[0], terms[1], multidomain, source_id)

    # Replace the trailing ",\n" with a terminating semicolon.
    sql_insert_kplet_file = sql_insert_kplet_file[:-2] + ";"
    sql_insert_kplet = sql_insert_kplet[:-2] + ";"

    _db = DbClass()
    _db.cmd = sql_insert_kplet
    _db.execute()
    _db.cmd = sql_insert_kplet_file
    _db.execute()
    _db.commit()
def insert_duplets(kplets_pile, profile2id, file2id):
    """Bulk-insert baited duplets and link each to the file it was found in."""
    sql_insert_kplet = """insert ignore into prok1402_baited_duplet (kplet_1, kplet_2) values \n"""
    sql_insert_kplet_file = """insert ignore into prok1402_baited_duplet_file (kplet_id, file_id) values \n"""

    for (duplet, fname) in kplets_pile:
        terms = sorted(list(duplet))
        terms = [profile2id[_] for _ in terms]
        fname = os.path.basename(fname)
        file_id = file2id[fname]

        sql_insert_kplet += """(%s, %s),\n""" % (terms[0], terms[1])
        sql_insert_kplet_file += """((select id from prok1402_baited_duplet where """ + \
                                 """kplet_1=%s and kplet_2=%s), %d),\n""" % (terms[0], terms[1], file_id)

    sql_insert_kplet_file = sql_insert_kplet_file[:-2] + ";"
    sql_insert_kplet = sql_insert_kplet[:-2] + ";"

    _db = DbClass()
    _db.cmd = sql_insert_kplet
    _db.execute()
    _db.cmd = sql_insert_kplet_file
    _db.execute()
    _db.commit()
def get_report_kplets(id2cdd, limit_to=500):
    """Return crispr 2-plets found in more than one WGS file, most frequent first."""
    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()
    _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files
                 from crispr_2plets cp
                 inner join crispr_2plets_wgs cpw on cp.id = cpw.kplet_id
                 inner join crispr_wgs_files cwf on cpw.file_id = cwf.id
                 group by cp.id
                 having count(distinct cwf.id)>1
                 order by cnt desc
                 limit 0, %d""" % limit_to
    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:3]]
        if len(set(kplet_codes)) != 2:
            continue
        count = row[3]
        files = row[4].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)
    return out_list
def insert_kplet(profiles):
    """Insert a single 4-plet of profiles into archea_4plets."""
    profile2id = t.map_profile2id(profiles)
    _db = DbClass()
    sql_cmd = """insert into archea_4plets (kplet_1, kplet_2, kplet_3, kplet_4) values (%d, %d, %d, %d)"""
    sql_cmd = sql_cmd % (profile2id[profiles[0]],
                         profile2id[profiles[1]],
                         profile2id[profiles[2]],
                         profile2id[profiles[3]])
    _db.cmd = sql_cmd
    _db.execute()
def get_multiple_kplets():
    """Return 3-plet ids that occur in more than one file, with their concatenated file ids."""
    _db = DbClass()
    _db.cmd = "SET group_concat_max_len = 100000;"
    _db.execute()
    _db.cmd = """select ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 group by ap.id
                 having count(*)>1
                 order by cnt desc"""
    return _db.retrieve()
def store_kplets_pile(prefix, kplets_pile, profile2id, file2id):
    """Bulk-insert 3-plets and their file links into the tables named by prefix."""
    kplet_table_name = "%s_3plets" % prefix
    kplet_table_file_name = "%s_3plets_files" % prefix

    _sql_kplet = """insert ignore into %s (kplet_1, kplet_2, kplet_3) values \n""" % kplet_table_name
    _sql_kplet_file = """insert ignore into %s (kplet_id, file_id) values \n""" % kplet_table_file_name

    for (kplets, fname) in kplets_pile:
        for kplet in kplets:
            # Sort the profile codes and map them to numeric ids.
            kplet = list(kplet)
            kplet.sort()
            kplet = tuple([int(profile2id[k]) for k in kplet])

            _sql_kplet += """(%d, %d, %d),\n""" % kplet
            _sql_kplet_file += ("""((select id from %s where """ +
                                """kplet_1=%d and kplet_2=%d and kplet_3=%d),""" +
                                """%d),\n""") % ((kplet_table_name,) + kplet + (int(file2id[fname]),))

    _sql_kplet = _sql_kplet[:-2] + ';'
    _sql_kplet_file = _sql_kplet_file[:-2] + ';'

    _db = DbClass()
    _db.cmd = _sql_kplet
    _db.execute()
    _db.commit()
    _db.cmd = _sql_kplet_file
    _db.execute()
    _db.commit()
def get_report_kplets(limit_to=300, count_filter=1, load_locations=None):
    """Return archaeal 2-plets (with profile codes) seen in more than count_filter genomes,
    ranked by summed genome weight."""
    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=15000"""
    _db.execute()
    _db.cmd = """select apc.*, s1.cnt, s1.wgt, s1.an
                 from (
                     select ap.id, count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                     from archea_2plets ap
                     inner join archea_2plets_win10 apw on ap.id = apw.kplet_id
                     inner join archea_win10_files awf on apw.file_id = awf.id
                     inner join sources s on awf.source_id=s.id
                     inner join weights w on w.genome_id=s.genome_id
                     group by ap.id
                     having count(distinct s.genome_id) > %d
                 ) s1
                 inner join archea_2plets_codes apc on s1.id=apc.id
                 order by s1.wgt desc
                 limit 0,%d""" % (count_filter, limit_to)
    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = row[1:3]
        if len(set(kplet_codes)) != 2:
            continue
        count = row[3]
        files = row[5].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)
    if load_locations:
        _path = neighborhoods_path()
        [kplet.load_locations(_path) for kplet in out_list]
    return out_list
def store_kplets(kplets, fname):
    """Bulk-insert 4-plets (looked up by CDD profile code) and link them to fname."""
    _sql_kplet = """insert ignore into bacteria_4plets (kplet_1, kplet_2, kplet_3, kplet_4) values \n"""
    _sql_kplet_file = """insert ignore into bacteria_4plets_win10 (kplet_id, file_id) values \n"""

    for kplet in kplets:
        kplet = list(kplet)
        kplet.sort()
        kplet = tuple(kplet)

        _sql_kplet += ("""((select id from cdd_profiles where code='%s'),""" +
                       """(select id from cdd_profiles where code='%s'),""" +
                       """(select id from cdd_profiles where code='%s'),""" +
                       """(select id from cdd_profiles where code='%s')),\n""") % kplet
        _sql_kplet_file += ("""((select id from bacteria_4plets where """ +
                            """kplet_1=(select id from cdd_profiles where code='%s') """ +
                            """and kplet_2=(select id from cdd_profiles where code='%s') """ +
                            """and kplet_3=(select id from cdd_profiles where code='%s') """ +
                            """and kplet_4=(select id from cdd_profiles where code='%s')),""" +
                            """(select id from bacteria_win10_files where name='%s')),\n""") % (kplet + (fname,))

    _sql_kplet = _sql_kplet[:-2] + ';'
    _sql_kplet_file = _sql_kplet_file[:-2] + ';'

    _db = DbClass()
    _db.cmd = _sql_kplet
    _db.execute()
    _db.cmd = _sql_kplet_file
    _db.execute()
def store_kplets_pile(prefix, kplets_pile, profile2id, file2id):
    """Bulk-insert 4-plets and their file links into the tables named by prefix."""
    kplet_table_name = "%s_4plets" % prefix
    kplet_table_file_name = "%s_4plets_files" % prefix

    _sql_kplet = """insert ignore into %s (kplet_1, kplet_2, kplet_3, kplet_4) values \n""" % kplet_table_name
    _sql_kplet_file = """insert ignore into %s (kplet_id, file_id) values \n""" % kplet_table_file_name

    for (kplets, fname) in kplets_pile:
        for kplet in kplets:
            kplet = list(kplet)
            kplet.sort()
            kplet = tuple([int(profile2id[k]) for k in kplet])

            _sql_kplet += """(%d, %d, %d, %d),\n""" % kplet
            _sql_kplet_file += ("""((select id from %s where """ +
                                """kplet_1=%d and kplet_2=%d and kplet_3=%d and kplet_4=%d),""" +
                                """%d),\n""") % ((kplet_table_name,) + kplet + (int(file2id[fname]),))

    _sql_kplet = _sql_kplet[:-2] + ';'
    _sql_kplet_file = _sql_kplet_file[:-2] + ';'

    _db = DbClass()
    _db.cmd = _sql_kplet
    _db.execute()
    _db.commit()
    _db.cmd = _sql_kplet_file
    _db.execute()
    _db.commit()
total_cnt = 0
chunk_size = 10000
# NOTE: chunks is built but not used below; the loop walks table_rows directly.
chunks = [table_rows[i:i + chunk_size] for i in range(0, len(table_rows), chunk_size)]

for row in table_rows:
    (_cnt, _kplet_id, _file_id) = row
    assert _cnt > 1
    # Delete every duplicated (kplet_id, file_id) link, then re-insert a single row.
    _db.cmd = """delete from %s where kplet_id=%d and file_id=%d""" % (table_name, _kplet_id, _file_id)
    _db.execute()
    _db.commit()
    _db.cmd = """insert into %s(kplet_id, file_id) values (%d, %d)""" % (table_name, _kplet_id, _file_id)
    _db.execute()
    _db.commit()

    total_cnt += 1
    if total_cnt % 10000 == 0:
        print total_cnt

print total_cnt
print "Finished:", table_name
print
print
print "Duplicates saved in file:", fname
continue

sys.exit()

print("Starting to remove duplicates")
total_cnt = 0
chunk_size = 10000
# NOTE: chunks is built but not used below; the loop walks table_rows directly.
chunks = [table_rows[i:i + chunk_size] for i in range(0, len(table_rows), chunk_size)]

for row in table_rows:
    (_cnt, _kplet_id, _file_id) = row
    assert _cnt > 1
    # Delete every duplicated (kplet_id, file_id) link, then re-insert a single row.
    _db.cmd = """delete from %s where kplet_id=%d and file_id=%d""" % (table_name, _kplet_id, _file_id)
    _db.execute()
    _db.commit()
    _db.cmd = """insert into %s(kplet_id, file_id) values (%d, %d)""" % (table_name, _kplet_id, _file_id)
    _db.execute()
    _db.commit()

    total_cnt += 1
    if total_cnt % 10000 == 0:
        print total_cnt

print total_cnt
print "Finished:", table_name
print
print