Beispiel #1
0
    def fetch_motif_by_id(self, id):
        """Fetch a single JASPAR motif from the DB by its JASPAR matrix ID.

        Arguments:
        id - JASPAR matrix ID (e.g. 'MA0001.1'). Either a fully specified ID
             including the version number (e.g. MA0049.2) or just the base
             ID (e.g. MA0049). When only a base ID is given, the latest
             version is looked up and used.

        Returns:
        A Bio.motifs.jaspar.Motif object

        NOTE: The perl TFBS module lets the caller choose the type of matrix
        to return (PFM, PWM, ICM), but JASPAR always stores matrices as PFMs,
        so that choice is not offered here. Once a PFM has been fetched, its
        pwm() and pssm() methods can be called to obtain the normalized and
        log-odds matrices.

        """
        # Split the ID into its stable part and (possibly missing) version.
        stable_id, ver = jaspar.split_jaspar_id(id)

        # An ID without a version portion resolves to the latest version.
        ver = ver or self._fetch_latest_version(stable_id)

        # Resolving the internal matrix ID also serves as a validity check.
        internal_id = self._fetch_internal_id(stable_id, ver)

        return self._fetch_motif_by_internal_id(internal_id)
Beispiel #2
0
    def fetch_motif_by_id(self, id):
        """Fetch one JASPAR motif from the DB given its JASPAR matrix ID.

        Arguments:
        id - JASPAR matrix ID such as 'MA0001.1'. The ID may carry a version
             number (e.g. MA0049.2) or be only the base ID (e.g. MA0049);
             in the latter case the latest version is returned.

        Returns:
        A Bio.motifs.jaspar.Motif object

        NOTE: Unlike the perl TFBS module, there is no option to select the
        matrix type (PFM, PWM, ICM): matrices are always stored in JASPAR as
        PFMs. Call pwm() or pssm() on the fetched PFM to get the normalized
        or log-odds matrix.

        """
        base, requested_version = jaspar.split_jaspar_id(id)

        # Fall back to the latest version when the ID names none.
        resolved_version = (
            requested_version
            if requested_version
            else self._fetch_latest_version(base)
        )

        # The internal-ID lookup is also the validity check for the pair.
        return self._fetch_motif_by_internal_id(
            self._fetch_internal_id(base, resolved_version)
        )
Beispiel #3
0
    def _fetch_internal_id_list(
        self,
        collection=JASPAR_DFLT_COLLECTION,
        tf_name=None,
        tf_class=None,
        tf_family=None,
        matrix_id=None,
        tax_group=None,
        species=None,
        pazar_id=None,
        data_type=None,
        medline=None,
        all=False,
        all_versions=False,
    ):
        """Fetch list of internal JASPAR motif IDs.

        Fetch a list of internal JASPAR motif IDs based on various passed
        parameters which may then be used to fetch the rest of the motif data.

        Caller:
            fetch_motifs()

        Arguments:
            See arguments sections of fetch_motifs(). Each selection
            argument other than ``matrix_id``, ``all`` and ``all_versions``
            may be a single value or a list/tuple of alternative values.

        Returns:
            A list of internal JASPAR motif IDs which match the given
            selection criteria arguments. The list may be empty.

        Selection priority:

        1: ``all`` - every row of MATRIX is returned, all other arguments
           are ignored.

        2: ``matrix_id`` - only the named matrices are looked up, all other
           selection arguments are ignored. With ``all_versions`` every
           version of each base ID is returned, otherwise the requested
           version (or the latest one when the ID has no version part).

        3: Otherwise an SQL query is built from the remaining arguments:
           first joins/sub-clauses for named fields of the MATRIX and
           MATRIX_SPECIES tables (collection, name, species), then
           joins/sub-clauses for tag/value pairs of the MATRIX_ANNOTATION
           table. Unless ``all_versions`` is set, only matrices for which
           _is_latest_version() is true are kept. A BiopythonWarning is
           issued if nothing is left.

        For the surviving matrices, the responsibility to do matrix-based
        feature filtering such as ic, number of sites etc, fall on the
        calling fetch_motifs() method.

        """
        cur = self.dbh.cursor()

        # Special case 1: fetch ALL motifs. Highest priority.
        # Ignore all other selection arguments.
        if all:
            cur.execute("select ID from MATRIX")
            return [row[0] for row in cur.fetchall()]

        # Special case 2: fetch specific motifs by their JASPAR IDs. This
        # has higher priority than any other except the above 'all' case.
        # Ignore all other selection arguments.
        if matrix_id:
            # A bare string would otherwise be iterated character by
            # character; treat it as a one-element list.
            if isinstance(matrix_id, str):
                matrix_id = [matrix_id]

            int_ids = []
            for jaspar_id in matrix_id:
                # IDs may be either stable IDs or stable_ID.version.
                (base_id, version) = jaspar.split_jaspar_id(jaspar_id)

                if all_versions:
                    # Any version part is deliberately ignored here: the
                    # caller asked for every version of this base ID.
                    cur.execute("select ID from MATRIX where BASE_ID = %s", (base_id,))
                    int_ids.extend(row[0] for row in cur.fetchall())
                    continue

                # Only the latest version, or the requested version.
                if not version:
                    version = self._fetch_latest_version(base_id)

                int_id = None
                if version:
                    int_id = self._fetch_internal_id(base_id, version)

                if int_id:
                    int_ids.append(int_id)

            return int_ids

        tables = ["MATRIX m"]
        where_clauses = []
        # Values bound to the "%s" placeholders, in the order the
        # placeholders appear in the final SQL text. Binding (instead of
        # interpolating into the SQL string) keeps quotes in a value from
        # breaking or altering the query. "%s" is the same placeholder
        # style used for the BASE_ID lookup above.
        params = []

        def value_condition(column, value):
            """Return an SQL condition on column and bind its value(s)."""
            if isinstance(value, (list, tuple)):
                # NOTE: e.g. species are numeric taxonomy IDs but stored as
                # varchars in the DB, so every value is bound as a string.
                params.extend(str(v) for v in value)
                return column + " in (" + ", ".join(["%s"] * len(value)) + ")"
            params.append(str(value))
            return column + " = %s"

        def add_tag_filter(alias, tag, value):
            """Join MATRIX_ANNOTATION as alias and select on tag/value."""
            tables.append("MATRIX_ANNOTATION " + alias)
            where_clauses.append("m.ID = " + alias + ".ID")
            # The tag names are fixed constants from the table below, so
            # they are written into the SQL text; only values are bound.
            where_clauses.append(
                alias + ".TAG = '" + tag + "' and "
                + value_condition(alias + ".VAL", value)
            )

        # Select by MATRIX.COLLECTION
        if collection:
            where_clauses.append(value_condition("m.COLLECTION", collection))

        # Select by MATRIX.NAME
        if tf_name:
            where_clauses.append(value_condition("m.NAME", tf_name))

        # Select by MATRIX_SPECIES.TAX_ID
        if species:
            tables.append("MATRIX_SPECIES ms")
            where_clauses.append("m.ID = ms.ID")
            where_clauses.append(value_condition("ms.TAX_ID", species))

        # Tag based selection from MATRIX_ANNOTATION
        # Differs from perl TFBS module in that the matrix class explicitly
        # has a tag attribute corresponding to the tags in the database. This
        # provides tremendous flexibility in adding new tags to the DB and
        # being able to select based on those tags with out adding new code.
        # In the JASPAR Motif class we have elected to use specific attributes
        # for the most commonly used tags and here correspondingly only allow
        # selection on these attributes.
        #
        # Each row is (table alias, MATRIX_ANNOTATION.TAG, selection value).
        tag_filters = (
            ("ma1", "class", tf_class),
            ("ma2", "family", tf_family),
            ("ma3", "pazar_tf_id", pazar_id),
            ("ma4", "medline", medline),
            ("ma5", "type", data_type),
            ("ma6", "tax_group", tax_group),
        )
        for alias, tag, value in tag_filters:
            if value:
                add_tag_filter(alias, tag, value)

        sql = "".join(["select distinct(m.ID) from ", ", ".join(tables)])

        if where_clauses:
            sql = "".join([sql, " where ", " and ".join(where_clauses)])

        # Only pass an argument sequence when there is something to bind.
        if params:
            cur.execute(sql, tuple(params))
        else:
            cur.execute(sql)

        int_ids = []
        for row in cur.fetchall():
            int_id = row[0]
            # Without all_versions, keep only matrices that are the latest
            # version of their base ID.
            if all_versions or self._is_latest_version(int_id):
                int_ids.append(int_id)

        if not int_ids:
            warnings.warn(
                "Zero motifs returned with current select criteria", BiopythonWarning
            )

        return int_ids
Beispiel #4
0
    def _fetch_internal_id_list(
        self, collection=JASPAR_DFLT_COLLECTION, tf_name=None, tf_class=None,
        tf_family=None, matrix_id=None, tax_group=None, species=None,
        pazar_id=None, data_type=None, medline=None, all=False,
        all_versions=False
    ):
        """Fetch list of internal JASPAR motif IDs.

        Fetch a list of internal JASPAR motif IDs based on various passed
        parameters which may then be used to fetch the rest of the motif data.

        Caller:
            fetch_motifs()

        Arguments:
            See arguments sections of fetch_motifs(). Each selection
            argument other than ``matrix_id``, ``all`` and ``all_versions``
            may be a single value or a list/tuple of alternative values.

        Returns:
            A list of internal JASPAR motif IDs which match the given
            selection criteria arguments. The list may be empty.

        Selection priority:

        1: ``all`` - every row of MATRIX is returned, all other arguments
           are ignored.

        2: ``matrix_id`` - only the named matrices are looked up, all other
           selection arguments are ignored. With ``all_versions`` every
           version of each base ID is returned, otherwise the requested
           version (or the latest one when the ID has no version part).

        3: Otherwise an SQL query is built from the remaining arguments:
           first joins/sub-clauses for named fields of the MATRIX and
           MATRIX_SPECIES tables (collection, name, species), then
           joins/sub-clauses for tag/value pairs of the MATRIX_ANNOTATION
           table. Unless ``all_versions`` is set, only matrices for which
           _is_latest_version() is true are kept. A BiopythonWarning is
           issued if nothing is left.

        For the surviving matrices, the responsibility to do matrix-based
        feature filtering such as ic, number of sites etc, fall on the
        calling fetch_motifs() method.

        """
        cur = self.dbh.cursor()

        # Special case 1: fetch ALL motifs. Highest priority.
        # Ignore all other selection arguments.
        if all:
            cur.execute("select ID from MATRIX")
            return [row[0] for row in cur.fetchall()]

        # Special case 2: fetch specific motifs by their JASPAR IDs. This
        # has higher priority than any other except the above 'all' case.
        # Ignore all other selection arguments.
        if matrix_id:
            # A bare string would otherwise be iterated character by
            # character; treat it as a one-element list.
            if isinstance(matrix_id, str):
                matrix_id = [matrix_id]

            int_ids = []
            for jaspar_id in matrix_id:
                # IDs may be either stable IDs or stable_ID.version.
                (base_id, version) = jaspar.split_jaspar_id(jaspar_id)

                if all_versions:
                    # Any version part is deliberately ignored here: the
                    # caller asked for every version of this base ID.
                    cur.execute(
                        "select ID from MATRIX where BASE_ID = %s", (base_id,)
                    )
                    int_ids.extend(row[0] for row in cur.fetchall())
                    continue

                # Only the latest version, or the requested version.
                if not version:
                    version = self._fetch_latest_version(base_id)

                int_id = None
                if version:
                    int_id = self._fetch_internal_id(base_id, version)

                if int_id:
                    int_ids.append(int_id)

            return int_ids

        tables = ["MATRIX m"]
        where_clauses = []
        # Values bound to the "%s" placeholders, in the order the
        # placeholders appear in the final SQL text. Binding (instead of
        # interpolating into the SQL string) keeps quotes in a value from
        # breaking or altering the query. "%s" is the same placeholder
        # style used for the BASE_ID lookup above.
        params = []

        def value_condition(column, value):
            """Return an SQL condition on column and bind its value(s)."""
            if isinstance(value, (list, tuple)):
                # NOTE: e.g. species are numeric taxonomy IDs but stored as
                # varchars in the DB, so every value is bound as a string.
                params.extend(str(v) for v in value)
                placeholders = ", ".join(["%s"] * len(value))
                return column + " in (" + placeholders + ")"
            params.append(str(value))
            return column + " = %s"

        def add_tag_filter(alias, tag, value):
            """Join MATRIX_ANNOTATION as alias and select on tag/value."""
            tables.append("MATRIX_ANNOTATION " + alias)
            where_clauses.append("m.ID = " + alias + ".ID")
            # The tag names are fixed constants from the table below, so
            # they are written into the SQL text; only values are bound.
            where_clauses.append(
                alias + ".TAG = '" + tag + "' and "
                + value_condition(alias + ".VAL", value)
            )

        # Select by MATRIX.COLLECTION
        if collection:
            where_clauses.append(value_condition("m.COLLECTION", collection))

        # Select by MATRIX.NAME
        if tf_name:
            where_clauses.append(value_condition("m.NAME", tf_name))

        # Select by MATRIX_SPECIES.TAX_ID
        if species:
            tables.append("MATRIX_SPECIES ms")
            where_clauses.append("m.ID = ms.ID")
            where_clauses.append(value_condition("ms.TAX_ID", species))

        # Tag based selection from MATRIX_ANNOTATION
        # Differs from perl TFBS module in that the matrix class explicitly
        # has a tag attribute corresponding to the tags in the database. This
        # provides tremendous flexibility in adding new tags to the DB and
        # being able to select based on those tags with out adding new code.
        # In the JASPAR Motif class we have elected to use specific
        # attributes for the most commonly used tags and here correspondingly
        # only allow selection on these attributes.
        #
        # Each row is (table alias, MATRIX_ANNOTATION.TAG, selection value).
        tag_filters = (
            ("ma1", "class", tf_class),
            ("ma2", "family", tf_family),
            ("ma3", "pazar_tf_id", pazar_id),
            ("ma4", "medline", medline),
            ("ma5", "type", data_type),
            ("ma6", "tax_group", tax_group),
        )
        for alias, tag, value in tag_filters:
            if value:
                add_tag_filter(alias, tag, value)

        sql = "".join(["select distinct(m.ID) from ", ", ".join(tables)])

        if where_clauses:
            sql = "".join([sql, " where ", " and ".join(where_clauses)])

        # Only pass an argument sequence when there is something to bind.
        if params:
            cur.execute(sql, tuple(params))
        else:
            cur.execute(sql)

        int_ids = []
        for row in cur.fetchall():
            int_id = row[0]
            # Without all_versions, keep only matrices that are the latest
            # version of their base ID.
            if all_versions or self._is_latest_version(int_id):
                int_ids.append(int_id)

        if not int_ids:
            warnings.warn("Zero motifs returned with current select critera",
                          BiopythonWarning)

        return int_ids