import glob
import itertools
import logging
import os

import apsw
import numpy as np
import pandas as pd

import shared

# NOTE: window_schema, fragment_schema, model_schema and the exception
# classes (CreateDatabaseError, LoadModelScoresError, FindPathsStepwiseError)
# are assumed to be defined elsewhere in this module.


def main(directory, pdbid, r_ch, l_ch, input):
    wd = os.path.abspath(directory)
    subdir = os.path.join(wd, pdbid)
    complexid = "{0}{1}{2}".format(pdbid, r_ch, l_ch)
    complexdb = os.path.join(wd, "scores_{0}.db".format(complexid))
    # Skip complexes whose score database already exists
    if not shared.missing(complexdb):
        return
    # Initialize window data
    window_data = pd.read_csv(input)
    windows = list()
    fragments = list()
    # Windows are in 2nd-level subdirectories
    for window_dir in glob.iglob(os.path.join(subdir, "*")):
        # Skip 2nd-level files
        if not os.path.isdir(window_dir):
            continue
        windowindex = os.path.basename(window_dir)
        windowindex = windowindex.lower().replace(pdbid.lower(), "")
        try:
            windowindex = int(windowindex)
        except ValueError:
            raise CreateDatabaseError(
                "Expected window directory format $PDBID$WINDOWINDEX"
                " (e.g. 1bfg1)")
        window_row = dict(windowindex=windowindex, window_wd=window_dir)
        windows.append(window_row)
        # Fragments are in 3rd-level subdirectories
        for fragment_dir in glob.iglob(os.path.join(window_dir, "*")):
            # Skip 3rd-level files
            if not os.path.isdir(fragment_dir):
                continue
            fragmentindex = os.path.basename(fragment_dir)
            fragment_row = dict(windowindex=windowindex,
                                fragmentindex=fragmentindex)
            fragments.append(fragment_row)
    window_df = pd.merge(window_data, pd.DataFrame(windows),
                         on="windowindex")
    # Create fragment database
    with shared.new_conn(complexdb) as windowconn:
        cursor = windowconn.cursor()
        # Insert windows into database
        cursor.execute(window_schema)
        w_insert = shared.create_insert_statement("window",
                                                  window_df.columns)
        cursor.executemany(w_insert, window_df.to_dict("records"))
        # Insert fragments into database
        cursor.execute(fragment_schema)
        insert = shared.create_insert_statement(
            "fragment", ["windowindex", "fragmentindex"])
        cursor.executemany(insert, fragments)
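# For context, a minimal sketch of the contract shared.create_insert_statement
# is assumed to satisfy (an illustration, not the actual shared
# implementation): it builds an INSERT with named placeholders so that
# executemany() can consume lists of dicts like those built above.
def _create_insert_statement_sketch(tablename, columns):
    columns = list(columns)
    return "INSERT INTO {0} ({1}) VALUES ({2})".format(
        tablename, ", ".join(columns),
        ", ".join(":" + col for col in columns))
# e.g. _create_insert_statement_sketch(
#          "fragment", ["windowindex", "fragmentindex"])
# -> "INSERT INTO fragment (windowindex, fragmentindex)
#     VALUES (:windowindex, :fragmentindex)"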
def modeldist_sql(prev_window, cur_window, mode, tablename_fmt,
                  allowed_columns, allowed_fmt, disallowed_columns,
                  disallowed_fmt, index_fmt, **kwargs):
    # Normalize mode to its first letter: "a(llowed)" or "d(isallowed)"
    mode = mode[0].lower()
    tablename = tablename_fmt.format(prev_window=prev_window,
                                     cur_window=cur_window)
    schema_fmt = dict(a=allowed_fmt, d=disallowed_fmt)[mode]
    schema = schema_fmt.format(tablename=tablename)
    columns = dict(a=allowed_columns, d=disallowed_columns)[mode]
    insert = shared.create_insert_statement(tablename=tablename,
                                            columns=columns)
    index = index_fmt.format(tablename=tablename, prev_window=prev_window)
    return dict(schema=schema, insert=insert, index=index)
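# Example of how modeldist_sql composes its SQL, with hypothetical format
# strings (the real *_fmt templates live in the caller's configuration):
#
#   modeldist_sql(
#       prev_window=0, cur_window=1, mode="allowed",
#       tablename_fmt="modeldist{prev_window}{cur_window}",
#       allowed_columns=["modela", "modelb", "edgescore"],
#       allowed_fmt="CREATE TABLE {tablename} "
#                   "(modela INTEGER, modelb INTEGER, edgescore REAL)",
#       disallowed_columns=["modela", "modelb"],
#       disallowed_fmt="CREATE TABLE {tablename} "
#                      "(modela INTEGER, modelb INTEGER)",
#       index_fmt="CREATE INDEX idx_{tablename} ON {tablename}(modela)")
#
# would return the schema/insert/index statements for table "modeldist01".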
def prepare_sql(self, complexdb):
    sql_kwargs = dict(
        model_tablename="allmodel",
        model_choose_tablename="model",
        model_choose_columns=["modelid", "di", "coordinates"],
    )
    model_choose_schema = """
    CREATE TABLE IF NOT EXISTS {model_choose_tablename} (
    {model_choose_columns[0]} INTEGER PRIMARY KEY NOT NULL,
    {model_choose_columns[1]} REAL NOT NULL,
    {model_choose_columns[2]} TEXT NOT NULL,
    FOREIGN KEY({model_choose_columns[0]})
        REFERENCES {model_tablename}({model_choose_columns[0]})
    )""".format(**sql_kwargs)
    model_choose_insert = shared.create_insert_statement(
        sql_kwargs['model_choose_tablename'],
        sql_kwargs['model_choose_columns'])
    n_model_q_fmt = """SELECT count(*) AS ct
    FROM {model_choose_tablename}
    JOIN {model_tablename} USING({model_choose_columns[0]})
    JOIN fragment f USING(fragmentindex, windowindex)
    WHERE windowindex={{windowindex}}
    """.format(**sql_kwargs)
    window_data_q_fmt = """SELECT {model_choose_columns[0]}, modelindex,
    fragmentindex, windowindex, m.dfire AS dfire, itscore
    FROM {model_tablename} m
    JOIN fragment f USING(fragmentindex, windowindex)
    WHERE windowindex={{windowindex}} AND m.dfire IS NOT NULL
    """.format(**sql_kwargs)
    n_paths_q = "SELECT count(*) FROM {model_tablename}".format(**sql_kwargs)
    with shared.write_conn(complexdb) as conn:
        curs = conn.cursor()
        curs.execute(model_choose_schema)
    return dict(
        model_insert=model_choose_insert,
        window_data_q_fmt=window_data_q_fmt,
        n_model_q_fmt=n_model_q_fmt,
        n_paths=n_paths_q,
    )
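# Note on the *_fmt queries above: the doubled braces ({{windowindex}}) are
# escapes that survive the first .format(**sql_kwargs) call, so callers
# finish them later, e.g. (illustrative usage):
#   sql = self.prepare_sql(complexdb)
#   n_model_q = sql["n_model_q_fmt"].format(windowindex=0)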
def make_sql(self, complexid, nwindows, limit=None):
    path_select_fmt = '''SELECT pathsid, {windows}
    FROM paths{nwindows}
    '''
    path_count_fmt = "SELECT count(*) FROM paths{nwindows}"
    if limit:
        limit_clause = "\nLIMIT {0}".format(limit)
        path_select_fmt += limit_clause
        path_count_fmt += limit_clause
    cluster_tablename = "clusters{nwindows}".format(nwindows=nwindows)
    cluster_columns = ["pathsid", "cid", "is_medoid", "clustersize"]
    cluster_kwargs = dict(nwindows=nwindows,
                          cluster_tablename=cluster_tablename,
                          cluster_columns=cluster_columns,
                          paths_tablename="paths{}".format(nwindows))
    path_count_sql = path_count_fmt.format(**cluster_kwargs)
    cluster_count_sql = """SELECT count(*) FROM {cluster_tablename}
    """.format(**cluster_kwargs)
    cluster_schemas = """
    CREATE TABLE {cluster_tablename} (
    {cluster_columns[0]} INTEGER PRIMARY KEY NOT NULL,
    {cluster_columns[1]} INTEGER NOT NULL,
    {cluster_columns[2]} INTEGER NOT NULL,
    {cluster_columns[3]} INTEGER,
    FOREIGN KEY({cluster_columns[0]})
        REFERENCES {paths_tablename}({cluster_columns[0]})
    )""".format(**cluster_kwargs).split(";")
    cluster_insert = shared.create_insert_statement(cluster_tablename,
                                                    cluster_columns[:-1])
    path_select_sql = path_select_fmt.format(
        windows=", ".join("window{0}".format(x) for x in range(nwindows)),
        **cluster_kwargs)
    logging.debug("\n%s", path_select_sql)
    return dict(path_count=path_count_sql,
                cluster_count=cluster_count_sql,
                path_select=path_select_sql,
                cluster_schemas=cluster_schemas,
                cluster_insert=cluster_insert)
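# For illustration, with nwindows=3 and no limit, path_select_sql above
# expands to roughly:
#   SELECT pathsid, window0, window1, window2
#   FROM paths3
# Note that cluster_insert covers cluster_columns[:-1], i.e. it omits
# clustersize, which is presumably filled in by a later UPDATE once the
# cluster sizes are known.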
def load(self, complexdb, complexname, receptor_chain, ligand_chain,
         **kwargs):
    """Load model scores."""
    fragmentselect = """
    SELECT windowindex, fragmentindex, window_wd
    FROM fragment
    JOIN window USING(windowindex)"""
    model_ct_select = ("SELECT COUNT(*) FROM allmodel"
                       " WHERE fragmentindex=:fragmentindex"
                       " AND windowindex=:windowindex")
    modelcolumns = [
        "modelindex", "dfire", "itscore", "fragmentindex", "windowindex"
    ]
    modelinsert = shared.create_insert_statement("allmodel", modelcolumns)
    pdbid = "{0}{1}{2}".format(complexname, receptor_chain, ligand_chain)
    logging.debug(pdbid)
    receptor_chain = receptor_chain.lower()[:1]
    ligand_chain = ligand_chain.lower()
    with shared.write_conn(complexdb) as conn:
        curs = conn.cursor()
        curs.execute(model_schema)
        ## INSERT MODELS
        fragment_rows = shared.conn_to_pandas(fragmentselect, conn)
        fragment_rows = fragment_rows.to_dict("records")
        for frag_row in fragment_rows:
            windowindex = frag_row['windowindex']
            fragmentindex = frag_row['fragmentindex']
            frag_wd = os.path.join(frag_row['window_wd'],
                                   str(fragmentindex))
            itscorefile = os.path.join(frag_wd, shared.itscore_file)
            model_itscores = shared.read_itscore(itscorefile, kind="model")
            nmodels = len(model_itscores)
            nmodelrows = curs.execute(
                model_ct_select,
                dict(fragmentindex=fragmentindex,
                     windowindex=windowindex)).fetchone()[0]
            # Only (re)load fragments that are missing or incomplete
            if nmodelrows < nmodels:
                if nmodelrows:
                    # Drop the partial rows for this fragment so the
                    # re-insert below does not duplicate them
                    logging.debug("Dropping partial model rows")
                    curs.execute(
                        "DELETE FROM allmodel"
                        " WHERE fragmentindex=:fragmentindex"
                        " AND windowindex=:windowindex",
                        dict(fragmentindex=fragmentindex,
                             windowindex=windowindex))
                nitscore = len(model_itscores)
                dfirefile = os.path.join(frag_wd, shared.goap_file)
                model_dfires = shared.read_dfire_from_goap(dfirefile,
                                                           kind="model")
                modelrows = pd.merge(model_itscores, model_dfires,
                                     on="modelindex", how="left")
                ndi = len(modelrows)
                if nitscore != ndi:
                    logging.error("ITScores: %s", nitscore)
                    logging.error("IT lj dfire: %s", ndi)
                    raise LoadModelScoresError("Score number mismatch")
                modelrows['fragmentindex'] = fragmentindex
                modelrows['windowindex'] = windowindex
                curs.executemany(modelinsert, modelrows.to_dict("records"))
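# shared.read_itscore and shared.read_dfire_from_goap are assumed to return
# DataFrames keyed on "modelindex" (illustrative shapes, not the actual
# shared API):
#   model_itscores: modelindex | itscore
#   model_dfires:   modelindex | dfire
# The how="left" merge keeps every ITScore row even when no dfire value is
# available; such rows are stored with dfire NULL and filtered out later by
# the "m.dfire IS NOT NULL" clause in prepare_sql's window query.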
def make_paths(self, complexid, nwindows, batchsize):
    """Make paths for a specific window count."""
    global total_paths
    global skipped_paths
    cur_window = nwindows - 1
    n_edges = cur_window
    prev_window = cur_window - 1
    n_prev_edges = prev_window
    out_db_file = self.out_db_file
    in_db_file = self.in_db_fmt.format(cur_window=cur_window)
    score_db_file = self.score_db_file
    out_tablename = self.out_tablename_fmt.format(nwindows=nwindows)
    out_column_list = ["window%s" % x for x in range(nwindows)]
    out_schema = [
        "CREATE TABLE {tablename} (".format(tablename=out_tablename)
    ]
    out_schema.append("pathsid INTEGER PRIMARY KEY NOT NULL,")
    out_schema.extend("%s INTEGER NOT NULL," % w_col
                      for w_col in out_column_list)
    out_schema.extend("%s REAL NOT NULL," % col
                      for col in self.path_score_columns)
    out_schema.append(
        "timestamp DATE DEFAULT (datetime('now', 'localtime')));")
    out_schema = "\n".join(out_schema)
    logging.debug("Out schema: %s", out_schema)
    out_column_list.extend(self.path_score_columns)
    insert_sql = shared.create_insert_statement(tablename=out_tablename,
                                                columns=out_column_list)
    logging.debug("Out db: %s", out_db_file)
    logging.debug("Insert SQL: %s", insert_sql)
    if nwindows in (2, 3):
        # Both cases start from the pairwise table for windows 0 and 1
        start_db_file = self.in_db_fmt.format(cur_window=1)
        start_table = self.in_tablename_fmt.format(prev_window=0,
                                                   cur_window=1)
        window_columns = ["window%s" % x for x in range(2)]
        start_columns = window_columns + self.pair_score_columns
        edgescore_index = start_columns.index("edgescore")
        start_query = """
        SELECT modela AS {windows[0]}, modelb AS {windows[1]},
        {score_columns}
        FROM {tablename}
        """.format(tablename=start_table, windows=window_columns,
                   score_columns=", ".join(self.pair_score_columns))
    else:
        # Longer paths extend the medoid paths of the previous round
        start_db_file = out_db_file
        start_table = self.out_tablename_fmt.format(nwindows=cur_window)
        window_columns = ["window%s" % x for x in range(cur_window)]
        start_columns = window_columns + self.path_score_columns
        edgescore_index = start_columns.index("edgescores")
        start_query = "SELECT {columns} FROM {tablename}".format(
            columns=", ".join(start_columns), tablename=start_table)
        cluster = self.cluster_tablename_fmt.format(nwindows=cur_window)
        start_query += (" JOIN {clustertable} USING (pathsid)"
                        " WHERE is_medoid=1").format(clustertable=cluster)
    logging.debug("Starting path db: %s", start_db_file)
    logging.debug("Starting paths query: %s", start_query)
    logging.debug("Edgescore index: %s", edgescore_index)
    logging.debug("Current path db: %s", in_db_file)
    new_window = "window{cur_window}".format(cur_window=cur_window)
    new_edgescore_index = self.pair_score_columns.index("edgescore") + 1
    child_q_fmt = """SELECT modelb AS {new_window}, {pair_score_columns}
    FROM modeldist{prev_window}{cur_window}
    WHERE modela={{path[{prev_window}]}}""".format(
        new_window=new_window, cur_window=cur_window,
        prev_window=prev_window,
        pair_score_columns=", ".join(self.pair_score_columns))
    # Exclude candidate models that clash with any earlier window in the path
    for prior_window in range(prev_window):
        child_q_fmt += """
        AND window{cur_window} NOT IN
        (SELECT modelb FROM modeldist{prior_window}{cur_window}
         WHERE modela={{path[{prior_window}]}})""".format(
            cur_window=cur_window, prior_window=prior_window)
    if nwindows > 2:
        logging.debug("Next window query: %s", child_q_fmt)
        logging.debug("Edgescore index: %s", new_edgescore_index)
    with shared.ro_conn(start_db_file) as start_db:
        previous_path_list = list(start_db.cursor().execute(start_query))
    logging.debug("%s previous paths", len(previous_path_list))
    # Get single model scores
    score_q = ("SELECT modelid, {model_score_columns}"
               " FROM model JOIN allmodel USING(modelid)").format(
                   model_score_columns=", ".join(self.model_score_columns))
    with shared.ro_conn(score_db_file) as score_conn:
        score_gen = score_conn.cursor().execute(score_q)
        model_scores = {
            row[0]: dict(zip(self.model_score_columns, row[1:]))
            for row in score_gen
        }
    try:
        in_db_disk = apsw.Connection(in_db_file,
                                     flags=apsw.SQLITE_OPEN_READONLY)
    except apsw.CantOpenError:
        logging.error("Could not open %s", in_db_file)
        raise
    # Copy the edge database into memory for fast repeated queries
    memcon = apsw.Connection(":memory:")
    with memcon.backup("main", in_db_disk, "main") as backup:
        backup.step()
    in_db_disk.close()
    in_db = memcon.cursor()
    mean = np.mean

    def gen_paths():
        for prev_path in previous_path_list:
            if nwindows == 2:
                window_slice = 2
            else:
                window_slice = cur_window
            # Only keep windows (remove score components)
            prev_path_windows = prev_path[:window_slice]
            row_dict = dict(zip(window_columns, prev_path_windows))
            # Prepare score components
            mean_prev_edgescore = prev_path[edgescore_index]
            di_list = [
                model_scores[modelid]['di']
                for modelid in prev_path_windows
            ]
            if nwindows == 2:
                insert_dict = row_dict.copy()
                insert_dict["edgescores"] = mean_prev_edgescore
                insert_dict["nodescore"] = mean(di_list)
                yield insert_dict
            else:
                # Convert the stored mean back to a sum before extending
                prev_edgescore = mean_prev_edgescore * n_prev_edges
                current_path_q = child_q_fmt.format(path=prev_path_windows)
                current_path_list = in_db.execute(current_path_q)
                for new_path in current_path_list:
                    new_path_window = new_path[0]
                    new_path_edgescore = new_path[new_edgescore_index]
                    new_edgescore = (prev_edgescore +
                                     new_path_edgescore) / n_edges
                    new_di_list = list(di_list)
                    new_di_list.append(
                        model_scores[new_path_window]['di'])
                    nodescore = mean(new_di_list)
                    insert_dict = row_dict.copy()
                    insert_dict[new_window] = new_path_window
                    insert_dict["edgescores"] = new_edgescore
                    insert_dict["nodescore"] = nodescore
                    yield insert_dict

    old_total_paths = total_paths
    try:
        out_db_conn = apsw.Connection(
            out_db_file,
            flags=apsw.SQLITE_OPEN_READWRITE | apsw.SQLITE_OPEN_CREATE)
    except apsw.CantOpenError:
        logging.error("Could not open %s", out_db_file)
        raise
    out_db_conn.cursor().execute(out_schema)
    # Outer transaction
    with out_db_conn:
        out_db_curs = out_db_conn.cursor()
        path_gen = gen_paths()
        for path_chunk in itertools.zip_longest(
                *[iter(path_gen)] * batchsize, fillvalue=None):
            # Inner transaction
            with out_db_conn:
                for row_dict in path_chunk:
                    if row_dict is None:
                        continue
                    total_paths += 1
                    out_db_curs.execute(insert_sql, row_dict)
    # Cleanup
    memcon.close()
    out_db_conn.close()
    new_paths = total_paths - old_total_paths
    if not new_paths:
        raise FindPathsStepwiseError("No new paths were produced")
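# The batch loop in make_paths uses the classic "grouper" idiom: zip_longest
# over batchsize references to one iterator yields fixed-size chunks padded
# with None, which the inner loop skips. A standalone sketch (uses the
# module-level itertools import):
def _grouper_sketch(iterable, n, fillvalue=None):
    # [iter(iterable)] * n is n references to the SAME iterator, so
    # zip_longest pulls n consecutive items per chunk
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)
# list(_grouper_sketch(range(7), 3))
# -> [(0, 1, 2), (3, 4, 5), (6, None, None)]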