Example No. 1
def main(directory, pdbid, r_ch, l_ch, input):

    wd = os.path.abspath(directory)
    subdir = os.path.join(wd, pdbid)
    complexid = "{0}{1}{2}".format(pdbid, r_ch, l_ch)
    complexdb = os.path.join(wd, "scores_{0}.db".format(complexid))

    if not shared.missing(complexdb):
        return

    # Initialize window data
    window_data = pd.read_csv(input)
    windows = list()
    fragments = list()
    # Windows are in 2nd-level subdirectories
    for window_dir in glob.iglob(os.path.join(subdir, "*")):
        # Skip 2nd-level files
        if not os.path.isdir(window_dir):
            continue
        subdir, windowindex = os.path.split(window_dir)
        windowindex = windowindex.lower().replace(pdbid.lower(), "")
        try:
            windowindex = int(windowindex)
        except ValueError:
            raise CreateDatabaseError(
                "Expected window directory format $PDBID$WINDOWINDEX (e.g. 1bfg1)")
        window_row = dict(windowindex=windowindex, window_wd=window_dir)
        windows.append(window_row)
        # Fragments are in 3rd-level subdirectories
        for fragment_dir in glob.iglob(os.path.join(window_dir, "*")):
            # Skip 3rd-level files
            if not os.path.isdir(fragment_dir):
                continue
            window_dir, fragmentindex = os.path.split(fragment_dir)
            fragment_row = dict(windowindex=windowindex, fragmentindex=fragmentindex)
            fragments.append(fragment_row)

    window_df = pd.merge(window_data, pd.DataFrame(windows), on="windowindex")

    # Create fragment database
    with shared.new_conn(complexdb) as windowconn:
        cursor = windowconn.cursor()
        # Insert windows into database
        cursor.execute(window_schema)
        w_insert = shared.create_insert_statement("window", window_df.columns)
        cursor.executemany(w_insert, window_df.to_dict("records"))
        # Insert fragments into database
        cursor.execute(fragment_schema)
        insert = shared.create_insert_statement("fragment", ["windowindex", "fragmentindex"])
        cursor.executemany(insert, fragments)
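
All of these examples hand the return value of `shared.create_insert_statement` to `executemany` together with a list of dicts (here `window_df.to_dict("records")` and the `fragments` rows), which only works if the statement uses named placeholders. A minimal sketch of such a helper, offered as an assumption rather than the project's actual implementation:

def create_insert_statement(tablename, columns):
    # e.g. "INSERT INTO window (windowindex, window_wd) VALUES (:windowindex, :window_wd)"
    placeholders = ", ".join(":{0}".format(col) for col in columns)
    return "INSERT INTO {0} ({1}) VALUES ({2})".format(
        tablename, ", ".join(columns), placeholders)

Named placeholders are what let `executemany` consume the record dicts directly.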
Example No. 2
def modeldist_sql(prev_window, cur_window, mode, tablename_fmt,
                  allowed_columns, allowed_fmt, disallowed_columns,
                  disallowed_fmt, index_fmt, **kwargs):
    mode = mode[0].lower()
    tablename = tablename_fmt.format(prev_window=prev_window,
                                     cur_window=cur_window)
    schema_fmt = dict(a=allowed_fmt, d=disallowed_fmt)[mode]
    schema = schema_fmt.format(tablename=tablename)
    columns = dict(a=allowed_columns, d=disallowed_columns)[mode]
    insert = shared.create_insert_statement(tablename=tablename,
                                            columns=columns)
    index = index_fmt.format(tablename=tablename, prev_window=prev_window)
    return dict(schema=schema, insert=insert, index=index)
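
The table, schema, and index templates are supplied by the caller. A hypothetical call, with template strings inferred from the `modeldist{prev_window}{cur_window}` tables and `modela`/`modelb`/`edgescore` columns queried in Example No. 6 (not taken from the source):

# Hypothetical templates; the real ones live in the calling code.
sql = modeldist_sql(
    prev_window=0,
    cur_window=1,
    mode="allowed",
    tablename_fmt="modeldist{prev_window}{cur_window}",
    allowed_columns=["modela", "modelb", "edgescore"],
    allowed_fmt="CREATE TABLE {tablename} (modela INTEGER, modelb INTEGER, edgescore REAL)",
    disallowed_columns=["modela", "modelb"],
    disallowed_fmt="CREATE TABLE {tablename} (modela INTEGER, modelb INTEGER)",
    index_fmt="CREATE INDEX {tablename}_modela ON {tablename}(modela)",
)
# sql["schema"], sql["insert"] and sql["index"] are ready-to-execute strings.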
Example No. 3
    def prepare_sql(self, complexdb):

        sql_kwargs = dict(
            model_tablename="allmodel",
            model_choose_tablename="model",
            model_choose_columns=["modelid", "di", "coordinates"],
        )

        model_choose_schema = """
        CREATE TABLE IF NOT EXISTS {model_choose_tablename}
        (
        {model_choose_columns[0]} INTEGER PRIMARY KEY NOT NULL,
        {model_choose_columns[1]} REAL NOT NULL,
        {model_choose_columns[2]} TEXT NOT NULL,
        FOREIGN KEY({model_choose_columns[0]}) REFERENCES {model_tablename}({model_choose_columns[0]})
        )""".format(**sql_kwargs)

        model_choose_insert = shared.create_insert_statement(
            sql_kwargs['model_choose_tablename'],
            sql_kwargs['model_choose_columns'])

        n_model_q_fmt = """SELECT count(*) AS ct
        FROM {model_choose_tablename}
        JOIN {model_tablename} USING({model_choose_columns[0]})
        JOIN fragment f USING(fragmentindex, windowindex)
        WHERE windowindex={{windowindex}}
        """.format(**sql_kwargs)

        window_data_q_fmt = """SELECT {model_choose_columns[0]}, modelindex, fragmentindex,
        windowindex, m.dfire as dfire, itscore
        FROM {model_tablename} m JOIN fragment f USING(fragmentindex, windowindex)
        WHERE windowindex={{windowindex}} AND m.dfire IS NOT NULL
        """.format(**sql_kwargs)

        n_paths_q = "SELECT count(*) FROM {model_tablename}".format(
            **sql_kwargs)

        with shared.write_conn(complexdb) as conn:
            curs = conn.cursor()
            curs.execute(model_choose_schema)

        return dict(
            model_insert=model_choose_insert,
            window_data_q_fmt=window_data_q_fmt,
            n_model_q_fmt=n_model_q_fmt,
            n_paths=n_paths_q,
        )
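
The doubled braces in `n_model_q_fmt` and `window_data_q_fmt` survive the first `.format(**sql_kwargs)` call as single braces, so the returned templates can be formatted a second time with a concrete `windowindex`. A minimal illustration of the two-stage formatting:

# First pass fills in the fixed names, second pass fills in the window.
q_fmt = "SELECT count(*) FROM {table} WHERE windowindex={{windowindex}}".format(table="allmodel")
q = q_fmt.format(windowindex=3)
# q == "SELECT count(*) FROM allmodel WHERE windowindex=3"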
Example No. 4
    def make_sql(self, complexid, nwindows, limit=None):

        path_select_fmt = '''SELECT pathsid,
        {windows} FROM paths{nwindows}
        '''
        path_count_fmt = "SELECT count(*) FROM paths{nwindows}"
        if limit:
            limit_clause = "\nLIMIT {0}".format(limit)
            path_select_fmt += limit_clause
            path_count_fmt += limit_clause

        cluster_tablename = "clusters{nwindows}".format(nwindows=nwindows)
        cluster_columns = ["pathsid", "cid", "is_medoid", "clustersize"]
        cluster_kwargs = dict(nwindows=nwindows,
                              cluster_tablename=cluster_tablename,
                              cluster_columns=cluster_columns,
                              paths_tablename="paths{}".format(nwindows))

        path_count_sql = path_count_fmt.format(**cluster_kwargs)
        cluster_count_sql = """SELECT count(*) FROM {cluster_tablename}
        """.format(**cluster_kwargs)

        cluster_schemas = """
        CREATE TABLE {cluster_tablename}
        (
        {cluster_columns[0]} INTEGER PRIMARY KEY NOT NULL,
        {cluster_columns[1]} INTEGER NOT NULL,
        {cluster_columns[2]} INTEGER NOT NULL,
        {cluster_columns[3]} INTEGER,
        FOREIGN KEY({cluster_columns[0]}) REFERENCES {paths_tablename}({cluster_columns[0]})
        )""".format(**cluster_kwargs).split(";")

        cluster_insert = shared.create_insert_statement(
            cluster_tablename, cluster_columns[:-1])

        path_select_sql = path_select_fmt.format(
            windows=", ".join("window{0}".format(x) for x in range(nwindows)),
            **cluster_kwargs)

        logging.debug("\n%s", path_select_sql)

        return dict(path_count=path_count_sql,
                    cluster_count=cluster_count_sql,
                    path_select=path_select_sql,
                    cluster_schemas=cluster_schemas,
                    cluster_insert=cluster_insert)
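
For illustration (the value 3 is arbitrary, not from the source), with `nwindows=3` and no limit the format strings above render, whitespace trimmed, to the following; `.split(";")` leaves `cluster_schemas` as a one-element list because the rendered schema contains no semicolon:

# Rendered output of make_sql for nwindows=3 (whitespace trimmed).
cluster_schemas = ["""CREATE TABLE clusters3
(
pathsid INTEGER PRIMARY KEY NOT NULL,
cid INTEGER NOT NULL,
is_medoid INTEGER NOT NULL,
clustersize INTEGER,
FOREIGN KEY(pathsid) REFERENCES paths3(pathsid)
)"""]
path_select_sql = "SELECT pathsid, window0, window1, window2 FROM paths3"
path_count_sql = "SELECT count(*) FROM paths3"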
Example No. 5
    def load(self, complexdb, complexname, receptor_chain, ligand_chain,
             **kwargs):
        """
        Load model scores.
        """

        fragmentselect = """
        SELECT windowindex, fragmentindex, window_wd
        FROM fragment
        JOIN window USING(windowindex)"""

        model_ct_select = "SELECT COUNT(*) FROM allmodel WHERE fragmentindex=:fragmentindex AND windowindex=:windowindex"
        modelcolumns = [
            "modelindex", "dfire", "itscore", "fragmentindex", "windowindex"
        ]
        modelinsert = shared.create_insert_statement("allmodel", modelcolumns)

        pdbid = "{0}{1}{2}".format(complexname, receptor_chain, ligand_chain)
        logging.debug(pdbid)

        receptor_chain = receptor_chain.lower()[:1]
        ligand_chain = ligand_chain.lower()

        with shared.write_conn(complexdb) as conn:
            curs = conn.cursor()
            curs.execute(model_schema)
            # Insert models
            fragment_rows = shared.conn_to_pandas(fragmentselect, conn)
            fragment_rows = fragment_rows.to_dict("records")
            for frag_row in fragment_rows:
                windowindex = frag_row['windowindex']
                fragmentindex = frag_row['fragmentindex']
                frag_wd = os.path.join(frag_row['window_wd'],
                                       str(fragmentindex))

                itscorefile = os.path.join(frag_wd, shared.itscore_file)
                model_itscores = shared.read_itscore(itscorefile, kind="model")

                nmodels = len(model_itscores)
                nmodelrows = curs.execute(
                    model_ct_select,
                    dict(fragmentindex=fragmentindex,
                         windowindex=windowindex)).fetchone()[0]
                if nmodelrows < nmodels:
                    # Drop partial table
                    if nmodelrows:
                        logging.debug("Dropping partial model table")
                        curs.execute("DELETE FROM model")

                    nitscore = len(model_itscores)

                    dfirefile = os.path.join(frag_wd, shared.goap_file)
                    model_dfires = shared.read_dfire_from_goap(dfirefile,
                                                               kind="model")

                    modelrows = pd.merge(model_itscores,
                                         model_dfires,
                                         on="modelindex",
                                         how="left")
                    ndi = len(modelrows)

                    if len(set((nitscore, ndi))) != 1:
                        logging.error("ITScores: %s", nitscore)
                        logging.error("IT lj dfire: %s", ndi)
                        raise LoadModelScoresError("Score number mismatch")

                    modelrows['fragmentindex'] = fragmentindex
                    modelrows['windowindex'] = windowindex
                    curs.executemany(modelinsert, modelrows.to_dict("records"))
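
The score-count check after the merge relies on a `how="left"` join on a unique `modelindex` preserving the row count of the ITScore frame, with missing DFIRE values becoming NaN (which the `window_data_q_fmt` query in Example No. 3 filters out via `m.dfire IS NOT NULL`). A small self-contained sketch of that behavior, with made-up numbers:

import pandas as pd

# Made-up frames mirroring the shape of read_itscore / read_dfire_from_goap output.
model_itscores = pd.DataFrame({"modelindex": [1, 2, 3], "itscore": [-10.2, -8.7, -9.9]})
model_dfires = pd.DataFrame({"modelindex": [1, 3], "dfire": [-120.0, -118.5]})

modelrows = pd.merge(model_itscores, model_dfires, on="modelindex", how="left")
assert len(modelrows) == len(model_itscores)  # model 2 keeps a NaN dfire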
Example No. 6
    def make_paths(self, complexid, nwindows, batchsize):
        """Make paths for specific window size"""

        global total_paths
        global skipped_paths

        cur_window = nwindows - 1
        n_edges = cur_window
        prev_window = cur_window - 1
        n_prev_edges = prev_window

        out_db_file = self.out_db_file
        in_db_file = self.in_db_fmt.format(cur_window=cur_window)
        score_db_file = self.score_db_file

        out_tablename = self.out_tablename_fmt.format(nwindows=nwindows)

        out_column_list = ["window%s" % x for x in range(nwindows)]

        out_schema = [
            "CREATE TABLE {tablename} (".format(tablename=out_tablename)
        ]
        out_schema.append("pathsid INTEGER PRIMARY KEY NOT NULL,")
        out_schema.extend("%s INTEGER NOT NULL," % w_col
                          for w_col in out_column_list)
        out_schema.extend("%s REAL NOT NULL," % col
                          for col in self.path_score_columns)
        out_schema.append(
            "timestamp DATE DEFAULT (datetime('now', 'localtime')));")
        out_schema = "\n".join(out_schema)

        logging.debug("Out schema: %s", out_schema)

        out_column_list.extend(self.path_score_columns)
        insert_sql = shared.create_insert_statement(tablename=out_tablename,
                                                    columns=out_column_list)
        logging.debug("Out db: %s", out_db_file)
        logging.debug("Insert SQL: %s", insert_sql)

        if nwindows in (2, 3):
            start_db_file = self.in_db_fmt.format(cur_window=1)
            start_table = self.in_tablename_fmt.format(prev_window=0,
                                                       cur_window=1)
            window_columns = ["window%s" % (x) for x in range(2)]
            start_columns = window_columns + self.pair_score_columns
            edgescore_index = start_columns.index("edgescore")
            start_query = """
            SELECT modela AS {windows[0]}, modelb AS {windows[1]}, {score_columns} FROM {tablename}
            """.format(tablename=start_table,
                       windows=window_columns,
                       score_columns=", ".join(self.pair_score_columns))
        else:
            start_db_file = out_db_file
            start_table = self.out_tablename_fmt.format(nwindows=cur_window)
            window_columns = ["window%s" % (x) for x in range(cur_window)]
            start_columns = window_columns + self.path_score_columns
            edgescore_index = start_columns.index("edgescores")
            start_query = "SELECT {columns} FROM {tablename}".format(
                columns=", ".join(start_columns), tablename=start_table)
            cluster = self.cluster_tablename_fmt.format(nwindows=cur_window)
            start_query += " JOIN {clustertable} USING (pathsid) WHERE is_medoid=1".format(
                clustertable=cluster)

        logging.debug("Starting path db: %s", start_db_file)
        logging.debug("Starting paths query: %s", start_query)
        logging.debug("Edgescore index: %s", edgescore_index)

        logging.debug("Current path db: %s", in_db_file)

        new_window = "window{cur_window}".format(cur_window=cur_window)
        new_edgescore_index = self.pair_score_columns.index("edgescore") + 1
        child_q_fmt = """SELECT modelb AS {new_window}, {pair_score_columns} FROM modeldist{prev_window}{cur_window}
        WHERE modela={{path[{prev_window}]}}""".format(
            new_window=new_window,
            cur_window=cur_window,
            prev_window=prev_window,
            pair_score_columns=", ".join(self.pair_score_columns))
        for prior_window in range(prev_window):
            child_q_fmt += """
        AND window{cur_window} NOT IN (SELECT modelb FROM modeldist{prior_window}{cur_window}
        WHERE modela={{path[{prior_window}]}})""".format(
                cur_window=cur_window, prior_window=prior_window)

        if nwindows > 2:
            logging.debug("Next window query: %s", child_q_fmt)
            logging.debug("Edgescore index: %s", new_edgescore_index)

        with shared.ro_conn(start_db_file) as start_db:
            previous_path_list = list(start_db.cursor().execute(start_query))

        logging.debug("%s previous paths", len(previous_path_list))

        # Get single model scores
        score_q = "SELECT modelid, {model_score_columns} FROM model JOIN allmodel USING(modelid)".format(
            model_score_columns=", ".join(self.model_score_columns))
        with shared.ro_conn(score_db_file) as score_conn:
            score_gen = score_conn.cursor().execute(score_q)
            model_scores = {
                row[0]: dict(zip(self.model_score_columns, row[1:]))
                for row in score_gen
            }

        try:
            in_db_disk = apsw.Connection(in_db_file,
                                         flags=apsw.SQLITE_OPEN_READONLY)
        except apsw.CantOpenError:
            print(in_db_file)
            raise
        memcon = apsw.Connection(":memory:")
        with memcon.backup("main", in_db_disk, "main") as backup:
            backup.step()
        in_db_disk.close()
        in_db = memcon.cursor()

        mean = np.mean

        def gen_paths():
            for prev_path in previous_path_list:
                if nwindows == 2:
                    window_slice = 2
                else:
                    window_slice = cur_window
                # Only keep windows (remove score components)
                prev_path_windows = prev_path[:window_slice]
                row_dict = dict(zip(window_columns, prev_path_windows))
                # Prepare score components
                mean_prev_edgescore = prev_path[edgescore_index]
                di_list = [
                    model_scores[modelid]['di']
                    for modelid in prev_path_windows
                ]
                if nwindows == 2:
                    insert_dict = row_dict.copy()
                    insert_dict["edgescores"] = mean_prev_edgescore
                    insert_dict["nodescore"] = mean(di_list)
                    yield insert_dict
                else:
                    prev_edgescore = mean_prev_edgescore * n_prev_edges
                    current_path_q = child_q_fmt.format(path=prev_path_windows)
                    current_path_list = in_db.execute(current_path_q)
                    for new_path in current_path_list:
                        new_path_window = new_path[0]
                        new_path_edgescore = new_path[new_edgescore_index]
                        new_edgescore = (prev_edgescore +
                                         new_path_edgescore) / n_edges
                        new_di_list = list(di_list)
                        new_di_list.append(model_scores[new_path_window]['di'])
                        nodescore = mean(new_di_list)
                        insert_dict = row_dict.copy()
                        insert_dict[new_window] = new_path_window
                        insert_dict["edgescores"] = new_edgescore
                        insert_dict["nodescore"] = nodescore
                        yield insert_dict

        old_total_paths = total_paths
        try:
            out_db_conn = apsw.Connection(out_db_file,
                                          flags=apsw.SQLITE_OPEN_READWRITE
                                          | apsw.SQLITE_OPEN_CREATE)
        except apsw.CantOpenError:
            print(out_db_file)
            raise
        out_db_conn.cursor().execute(out_schema)
        # Outer transaction
        with out_db_conn:
            out_db_curs = out_db_conn.cursor()
            path_gen = gen_paths()
            for path_chunk in itertools.zip_longest(*[iter(path_gen)] * batchsize,
                                                    fillvalue=None):
                # Inner transaction
                with out_db_conn:
                    for row_dict in path_chunk:
                        if row_dict is None:
                            continue
                        total_paths += 1
                        out_db_curs.execute(insert_sql, row_dict)

        # Cleanup
        memcon.close()
        out_db_conn.close()
        new_paths = total_paths - old_total_paths
        if not new_paths:
            raise FindPathsStepwiseError("No new paths were produced")
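
Copying the read-only pair-score database into memory before issuing one child query per previous path keeps the inner loop off the disk. The example uses apsw's backup API; the same pattern with the standard library (Python 3.7+) would look roughly like this, with a hypothetical filename standing in for `in_db_file`:

import sqlite3

disk_conn = sqlite3.connect("file:modeldist.db?mode=ro", uri=True)  # hypothetical path
mem_conn = sqlite3.connect(":memory:")
disk_conn.backup(mem_conn)  # plays the role of memcon.backup("main", in_db_disk, "main")
disk_conn.close()
in_db = mem_conn.cursor()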