def fetch_motif_by_id(self, id):
    """
    Fetch a single JASPAR motif from the DB by its JASPAR matrix ID
    (e.g. 'MA0001.1').

    Arguments:
        id - JASPAR matrix ID. Either a fully specified ID including the
             version number (e.g. MA0049.2) or just the base ID
             (e.g. MA0049). When no version is given, the latest version
             is looked up and used.

    Returns:
        A Bio.motifs.jaspar.Motif object (the result of
        _fetch_motif_by_internal_id()).

    NOTE: The perl TFBS module allows you to specify the type of matrix to
    return (PFM, PWM, ICM), but matrices are always stored in JASPAR as
    PFMs, so that option does not belong here. Once a PFM is fetched, the
    pwm() and pssm() methods can be called to obtain the normalized and
    log-odds matrices.
    """
    # Break the ID into its stable part and its (possibly empty) version
    stable_id, requested_version = jaspar.split_jaspar_id(id)

    # Without an explicit version, default to the most recent one
    if requested_version:
        resolved_version = requested_version
    else:
        resolved_version = self._fetch_latest_version(stable_id)

    # Resolving the internal matrix ID doubles as a validity check
    internal_id = self._fetch_internal_id(stable_id, resolved_version)

    # The internal ID is what the motif loader works with
    return self._fetch_motif_by_internal_id(internal_id)
def _fetch_internal_id_list(
    self, collection=JASPAR_DFLT_COLLECTION, tf_name=None, tf_class=None,
    tf_family=None, matrix_id=None, tax_group=None, species=None,
    pazar_id=None, data_type=None, medline=None, all=False,
    all_versions=False
):
    """
    Fetch a list of internal JASPAR motif IDs based on various passed
    parameters which may then be used to fetch the rest of the motif data.

    Caller:
        fetch_motifs()

    Arguments:
        See the arguments section of fetch_motifs(). Every selection
        argument may be a single value or a list of values. 'all' and
        'matrix_id' (in that order) take priority over, and disable, all
        other selection arguments.

    Returns:
        A list of internal JASPAR motif IDs which match the given
        selection criteria arguments.

    An SQL query is built from the selection arguments provided:

    1: First add table joins and sub-clauses for criteria corresponding to
       named fields from the MATRIX and MATRIX_SPECIES tables such as
       collection, name, species etc.

    2: Then add joins/sub-clauses for tag/value parameters from the
       MATRIX_ANNOTATION table.

    Values are never spliced into the SQL text; they are bound through the
    cursor using the '%s' placeholder style this method already used for
    the BASE_ID lookup.

    For the surviving matrices, the responsibility to do matrix-based
    feature filtering such as ic, number of sites etc. falls on the calling
    fetch_motifs() method.

    NOTE(review): this file contains a second definition of
    _fetch_internal_id_list further down; if both live in the same scope
    the later one replaces this one. Keep them in sync or remove one.
    """
    cur = self.dbh.cursor()

    # Special case 1: fetch ALL motifs. Highest priority.
    # Ignore all other selection arguments.
    if all:
        cur.execute("select ID from MATRIX")
        return [row[0] for row in cur.fetchall()]

    # Special case 2: fetch specific motifs by their JASPAR IDs. This has
    # higher priority than any other except the above 'all' case.
    # Ignore all other selection arguments.
    if matrix_id:
        # A lone ID string would otherwise be iterated character by
        # character; treat it as a one-element list.
        if isinstance(matrix_id, str):
            matrix_id = [matrix_id]

        int_ids = []
        for jaspar_id in matrix_id:
            # These might be either stable IDs or stable_ID.version.
            base_id, version = jaspar.split_jaspar_id(jaspar_id)

            if all_versions:
                # Any version part is ignored here: every version of the
                # base ID is returned.
                cur.execute(
                    "select ID from MATRIX where BASE_ID = %s", (base_id,)
                )
                int_ids.extend(row[0] for row in cur.fetchall())
            else:
                # Only the requested version, or the latest one if the ID
                # carries no version.
                if not version:
                    version = self._fetch_latest_version(base_id)
                int_id = self._fetch_internal_id(base_id, version)
                if int_id:
                    int_ids.append(int_id)

        return int_ids

    tables = ["MATRIX m"]
    where_clauses = []
    params = []

    def add_value_filter(column, wanted):
        # Append "<column> = %s" (single value) or "<column> in (%s, ...)"
        # (list of values) to the WHERE clauses and queue the bound values.
        # Values are bound as text because the original query compared
        # every value as a quoted string (e.g. TAX_ID is numeric but
        # stored as a varchar).
        if isinstance(wanted, list):
            marks = ", ".join(["%s"] * len(wanted))
            where_clauses.append("%s in (%s)" % (column, marks))
            params.extend("%s" % (value,) for value in wanted)
        else:
            where_clauses.append("%s = %%s" % column)
            params.append("%s" % (wanted,))

    # Select by MATRIX.COLLECTION
    if collection:
        add_value_filter("m.COLLECTION", collection)

    # Select by MATRIX.NAME
    if tf_name:
        add_value_filter("m.NAME", tf_name)

    # Select by MATRIX_SPECIES.TAX_ID
    if species:
        tables.append("MATRIX_SPECIES ms")
        where_clauses.append("m.ID = ms.ID")
        add_value_filter("ms.TAX_ID", species)

    # Tag based selection from MATRIX_ANNOTATION.
    #
    # Differs from the perl TFBS module in that the matrix class explicitly
    # has a tag attribute corresponding to the tags in the database. In the
    # JASPAR Motif class we have elected to use specific attributes for the
    # most commonly used tags and here correspondingly only allow selection
    # on these attributes: (table alias, MATRIX_ANNOTATION.TAG, argument).
    annotation_filters = [
        ("ma1", "class", tf_class),
        ("ma2", "family", tf_family),
        ("ma3", "pazar_tf_id", pazar_id),
        ("ma4", "medline", medline),
        ("ma5", "type", data_type),
        ("ma6", "tax_group", tax_group),
    ]
    for alias, tag, wanted in annotation_filters:
        if not wanted:
            continue
        # Each tag gets its own join of MATRIX_ANNOTATION so that several
        # tag filters can be combined in one query.
        tables.append("MATRIX_ANNOTATION %s" % alias)
        where_clauses.append("m.ID = %s.ID" % alias)
        # The TAG predicate must be present for single values as well as
        # for lists; without it the VAL test would match any tag.
        where_clauses.append("%s.TAG = '%s'" % (alias, tag))
        add_value_filter("%s.VAL" % alias, wanted)

    sql = "select distinct(m.ID) from " + ", ".join(tables)
    if where_clauses:
        sql = sql + " where " + " and ".join(where_clauses)

    if params:
        cur.execute(sql, tuple(params))
    else:
        cur.execute(sql)

    # Unless all versions were requested, keep only matrices which are the
    # latest version of their base ID.
    int_ids = [
        row[0] for row in cur.fetchall()
        if all_versions or self._is_latest_version(row[0])
    ]

    if not int_ids:
        warn("Warning: Zero motifs returned with current select critera")

    return int_ids
def _fetch_internal_id_list(
    self, collection=JASPAR_DFLT_COLLECTION, tf_name=None, tf_class=None,
    tf_family=None, matrix_id=None, tax_group=None, species=None,
    pazar_id=None, data_type=None, medline=None, all=False,
    all_versions=False
):
    """
    Fetch a list of internal JASPAR motif IDs based on various passed
    parameters which may then be used to fetch the rest of the motif data.

    Caller:
        fetch_motifs()

    Arguments:
        See the arguments section of fetch_motifs(). Every selection
        argument may be a single value or a list of values. 'all' and
        'matrix_id' (in that order) take priority over, and disable, all
        other selection arguments.

    Returns:
        A list of internal JASPAR motif IDs which match the given
        selection criteria arguments.

    An SQL query is built from the selection arguments provided:

    1: First add table joins and sub-clauses for criteria corresponding to
       named fields from the MATRIX and MATRIX_SPECIES tables such as
       collection, name, species etc.

    2: Then add joins/sub-clauses for tag/value parameters from the
       MATRIX_ANNOTATION table.

    Values are never spliced into the SQL text; they are bound through the
    cursor using the '%s' placeholder style this method already used for
    the BASE_ID lookup.

    For the surviving matrices, the responsibility to do matrix-based
    feature filtering such as ic, number of sites etc. falls on the calling
    fetch_motifs() method.

    NOTE(review): this file contains an earlier definition of
    _fetch_internal_id_list as well; if both live in the same scope this
    later one is the one that stays bound. Keep them in sync or remove one.
    """
    cur = self.dbh.cursor()

    # Special case 1: fetch ALL motifs. Highest priority.
    # Ignore all other selection arguments.
    if all:
        cur.execute("select ID from MATRIX")
        return [row[0] for row in cur.fetchall()]

    # Special case 2: fetch specific motifs by their JASPAR IDs. This has
    # higher priority than any other except the above 'all' case.
    # Ignore all other selection arguments.
    if matrix_id:
        # A lone ID string would otherwise be iterated character by
        # character; treat it as a one-element list.
        if isinstance(matrix_id, str):
            matrix_id = [matrix_id]

        int_ids = []
        for jaspar_id in matrix_id:
            # These might be either stable IDs or stable_ID.version.
            base_id, version = jaspar.split_jaspar_id(jaspar_id)

            if all_versions:
                # Any version part is ignored here: every version of the
                # base ID is returned.
                cur.execute(
                    "select ID from MATRIX where BASE_ID = %s", (base_id,)
                )
                int_ids.extend(row[0] for row in cur.fetchall())
            else:
                # Only the requested version, or the latest one if the ID
                # carries no version.
                if not version:
                    version = self._fetch_latest_version(base_id)
                int_id = self._fetch_internal_id(base_id, version)
                if int_id:
                    int_ids.append(int_id)

        return int_ids

    tables = ["MATRIX m"]
    where_clauses = []
    params = []

    def add_value_filter(column, wanted):
        # Append "<column> = %s" (single value) or "<column> in (%s, ...)"
        # (list of values) to the WHERE clauses and queue the bound values.
        # Values are bound as text because the original query compared
        # every value as a quoted string (e.g. TAX_ID is numeric but
        # stored as a varchar).
        if isinstance(wanted, list):
            marks = ", ".join(["%s"] * len(wanted))
            where_clauses.append("%s in (%s)" % (column, marks))
            params.extend("%s" % (value,) for value in wanted)
        else:
            where_clauses.append("%s = %%s" % column)
            params.append("%s" % (wanted,))

    # Select by MATRIX.COLLECTION
    if collection:
        add_value_filter("m.COLLECTION", collection)

    # Select by MATRIX.NAME
    if tf_name:
        add_value_filter("m.NAME", tf_name)

    # Select by MATRIX_SPECIES.TAX_ID
    if species:
        tables.append("MATRIX_SPECIES ms")
        where_clauses.append("m.ID = ms.ID")
        add_value_filter("ms.TAX_ID", species)

    # Tag based selection from MATRIX_ANNOTATION.
    #
    # Differs from the perl TFBS module in that the matrix class explicitly
    # has a tag attribute corresponding to the tags in the database. In the
    # JASPAR Motif class we have elected to use specific attributes for the
    # most commonly used tags and here correspondingly only allow selection
    # on these attributes: (table alias, MATRIX_ANNOTATION.TAG, argument).
    annotation_filters = [
        ("ma1", "class", tf_class),
        ("ma2", "family", tf_family),
        ("ma3", "pazar_tf_id", pazar_id),
        ("ma4", "medline", medline),
        ("ma5", "type", data_type),
        ("ma6", "tax_group", tax_group),
    ]
    for alias, tag, wanted in annotation_filters:
        if not wanted:
            continue
        # Each tag gets its own join of MATRIX_ANNOTATION so that several
        # tag filters can be combined in one query.
        tables.append("MATRIX_ANNOTATION %s" % alias)
        where_clauses.append("m.ID = %s.ID" % alias)
        # The TAG predicate must be present for single values as well as
        # for lists; without it the VAL test would match any tag.
        where_clauses.append("%s.TAG = '%s'" % (alias, tag))
        add_value_filter("%s.VAL" % alias, wanted)

    sql = "select distinct(m.ID) from " + ", ".join(tables)
    if where_clauses:
        sql = sql + " where " + " and ".join(where_clauses)

    if params:
        cur.execute(sql, tuple(params))
    else:
        cur.execute(sql)

    # Unless all versions were requested, keep only matrices which are the
    # latest version of their base ID.
    int_ids = [
        row[0] for row in cur.fetchall()
        if all_versions or self._is_latest_version(row[0])
    ]

    if not int_ids:
        warn("Warning: Zero motifs returned with current select critera")

    return int_ids