def __init__(self, dbm, db_key, protocol=None):
    r"""
    Parameters
    ----------
    dbm : :class:`~kdvs.core.db.DBManager`
        instance of DBManager that will control the database table

    db_key : string
        identifier of the database table that will be used by DBManager

    protocol : integer/None
        pickling protocol; if None, then the highest one will be used

    See Also
    --------
    pickle
    """
    self.tmpl = DBSHELVE_TMPL
    self.dtsh = DBTable.fromTemplate(dbm, db_key, template=self.tmpl)
    if protocol is not None:
        self.protocol = protocol
    else:
        self.protocol = cPickle.HIGHEST_PROTOCOL
    self.dtsh.create(indexed_columns=self.tmpl['indexes'])
    self.key = self.tmpl['columns'][0]
    self.val = self.tmpl['columns'][1]
    self.name = self.tmpl['name']
    self.cs = self.dtsh.db.cursor()
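
# Hedged usage sketch (not part of the library): it mirrors what the constructor
# above does with DBSHELVE_TMPL, showing the template-driven creation pattern in
# isolation. The template key names ('name', 'columns', 'indexes') are taken from
# how the constructor accesses them; 'example_db' and the write root are
# hypothetical, and the import paths follow the docstring references above.
def _example_create_shelve_table(write_root, db_key='example_db', protocol=None):
    import cPickle
    from kdvs.core.db import DBManager
    from kdvs.fw.DBTable import DBTable
    dbm = DBManager(write_root)
    # build the backing table from the same template the constructor uses
    dtsh = DBTable.fromTemplate(dbm, db_key, template=DBSHELVE_TMPL)
    dtsh.create(indexed_columns=DBSHELVE_TMPL['indexes'])
    key_col, val_col = DBSHELVE_TMPL['columns'][0], DBSHELVE_TMPL['columns'][1]
    # default to the highest pickling protocol, as the constructor does
    if protocol is None:
        protocol = cPickle.HIGHEST_PROTOCOL
    return dtsh, key_col, val_col, protocol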
def generateHGNCSynonyms(hgnc_dsv, map_db_key):
    r"""
    Create a helper table that eases resolving synonymous gene symbols with HGNC
    data. The helper table may be created in a different subordinated database
    than the original HGNC data. The table is specified via template.

    Parameters
    ----------
    hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains HGNC data

    map_db_key : string
        ID of the database that will hold the helper table

    Returns
    -------
    synonyms_dt : :class:`~kdvs.fw.DBTable.DBTable`
        DBTable wrapper for the newly created helper table

    See Also
    --------
    kdvs.fw.DBTable.DBTemplate
    """
    synonyms_dt = DBTable.fromTemplate(hgnc_dsv.dbm, map_db_key, HGNCSYNONYMS_TMPL)
    synonyms_dt.create(indexed_columns=HGNCSYNONYMS_TMPL['indexes'])
    syn_columns = (HGNC_APPROVED_SYMBOL_COL, HGNC_SYNONYMS_COL)
    syn_filter = "%s not like %s" % (quote(HGNC_SYNONYMS_COL), quote(HGNC_FIELD_EMPTY))
    hgnc_cs = hgnc_dsv.get(columns=syn_columns, filter_clause=syn_filter)

    def _gen():
        for syn_row in hgnc_cs:
            approved, rawsyns = [str(r) for r in syn_row]
            if len(rawsyns) > 0:
                # parse synonyms
                syns = [s.strip() for s in rawsyns.split(HGNC_FIELDS_SEP)]
            else:
                # no synonyms, make approved its own synonym
                syns = [approved]
            for syn in syns:
                yield (syn, approved)

    synonyms_dt.load(_gen())
    hgnc_cs.close()
    return synonyms_dt
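
# Hedged usage sketch (not part of the library): builds the helper table with
# generateHGNCSynonyms() and scans it to map a possibly-synonymous gene symbol
# onto its HGNC-approved symbol. It assumes the helper table stores exactly the
# (synonym, approved) pairs yielded above; since the actual column names live in
# HGNCSYNONYMS_TMPL, the sketch iterates over all rows rather than pushing a
# filter into SQL. 'symbol' is a caller-supplied gene symbol string.
def _example_resolve_synonym(hgnc_dsv, map_db_key, symbol):
    synonyms_dt = generateHGNCSynonyms(hgnc_dsv, map_db_key)
    cs = synonyms_dt.get()  # cursor over (synonym, approved) rows
    approved = None
    for syn, appr in cs:
        if str(syn) == symbol:
            approved = str(appr)
            break
    cs.close()
    return approved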
class TestDBResult1(unittest.TestCase):

    def __gen1(self):
        nums = range(1, len(self.test_cols) + 1)
        for l in string.ascii_uppercase:
            yield tuple(["%s%s" % (l, n) for n in nums])

    def setUp(self):
        self.test_write_root = TEST_INVARIANTS['test_write_root']
        self.testdb = 'DB1'
        self.test_cols = ('A', 'B', 'C')
        self.test_dtname = 'Test1'
        self.dbm = DBManager(self.test_write_root)
        self.dt1 = DBTable(self.dbm, self.testdb, self.test_cols, name=self.test_dtname)
        self.dt1.create()
        self.dt1.load(self.__gen1())
        self.rcs = self.dt1.db.cursor()

    def tearDown(self):
        self.dbm.close()
        db1_path = os.path.abspath('%s/%s.db' % (self.test_write_root, self.testdb))
        rootdb_path = os.path.abspath('%s/%s.root.db' % (self.test_write_root, SYSTEM_NAME_LC))
        if os.path.exists(db1_path):
            os.remove(db1_path)
        if os.path.exists(rootdb_path):
            os.remove(rootdb_path)
        self.dbm = None

    def testDBR_get1(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get generator within cursor size limits
        res = list(dbr.get())
        ref_res = list(self.dt1.get())
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get2(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as iterable within cursor size limits
        res = dbr.getAll()
        ref_res = list(self.dt1.get())
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get3(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as iterable within cursor size limits
        res = dbr.getAll(as_dict=False, dict_on_rows=False)
        ref_res = list(self.dt1.get())
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get4(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as dict within cursor size limits, keyed on columns
        res = dbr.getAll(as_dict=True, dict_on_rows=False)
        ref_res = {}
        for ix, c in enumerate(self.test_cols):
            ref_res[c] = [u"%s%d" % (l, ix + 1) for l in string.ascii_uppercase]
        self.assertDictEqual(ref_res, res)

    def testDBR_get5(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as dict within cursor size limits, keyed on rows
        res = dbr.getAll(as_dict=True, dict_on_rows=True)
        ref_res = {}
        numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
        for l in string.ascii_uppercase:
            key = u'%s%s' % (l, numsuffs[0])
            vls = [u'%s%s' % (l, ns) for ns in numsuffs[1:]]
            if key not in ref_res:
                ref_res[key] = vls
        self.assertDictEqual(ref_res, res)

    def testDBR_get6(self):
        # generate 100x load
        for _ in range(99):
            self.dt1.load(self.__gen1())
        self.rcs.execute('select * from %s' % self.test_dtname)
        # get all as iterable with limited cursor size, 26 internal loops
        dbr = DBResult(self.dt1, self.rcs, rowbufsize=100)
        # get all results at once
        res = list(dbr.get())
        numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
        ref_res = []
        for i in range(100):
            for l in string.ascii_uppercase:
                tup = tuple([u'%s%s' % (l, ns) for ns in numsuffs])
                ref_res.append(tup)
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get7(self):
        # generate 100x load
        for _ in range(99):
            self.dt1.load(self.__gen1())
        self.rcs.execute('select * from %s' % self.test_dtname)
        # get all as iterable with limited cursor size, 3 internal loops
        dbr = DBResult(self.dt1, self.rcs, rowbufsize=1000)
        # get all results at once
        res = list(dbr.get())
        numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
        ref_res = []
        for i in range(100):
            for l in string.ascii_uppercase:
                tup = tuple([u'%s%s' % (l, ns) for ns in numsuffs])
                ref_res.append(tup)
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get8(self):
        # generate 100x load
        for _ in range(99):
            self.dt1.load(self.__gen1())
        self.rcs.execute('select * from %s' % self.test_dtname)
        # get all as iterable with limited cursor size, 26 internal loops
        dbr = DBResult(self.dt1, self.rcs, rowbufsize=100)

        def __gen():
            numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
            for i in range(100):
                for l in string.ascii_uppercase:
                    tup = tuple([u'%s%s' % (l, ns) for ns in numsuffs])
                    yield tup

        ref_gen = __gen()
        # iterate over single results
        for rtup in dbr.get():
            self.assertEqual(ref_gen.next(), rtup)
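
# Hedged usage sketch (not part of the test suite): shows the DBResult buffering
# pattern exercised by the tests above. With 2600 rows in the table (26 letters
# x 100 loads), rowbufsize=100 drains the underlying cursor in 26 internal
# fetches and rowbufsize=1000 in 3, while the caller still sees a flat stream of
# row tuples from get(). 'dt' is assumed to be a created and loaded DBTable
# (like self.dt1 above) and 'table_name' its table name string.
def _example_stream_rows(dt, table_name, rowbufsize=100):
    cursor = dt.db.cursor()
    cursor.execute('select * from %s' % table_name)
    dbr = DBResult(dt, cursor, rowbufsize=rowbufsize)
    for row in dbr.get():
        # each row arrives as a tuple of column values
        yield row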
def build(self, anno_dsv, hgnc_dsv, map_db_key):
    r"""
    Construct the mapping using resources already present in KDVS DB (via
    :class:`~kdvs.core.db.DBManager`) and wrapped in :class:`~kdvs.fw.DSV.DSV`
    instances. The mapping is built as a database table and wrapped into a
    :class:`~kdvs.fw.DBTable.DBTable` instance; it is stored in the public
    attribute :attr:`dbt` of this instance. After the build is finished, the
    public attribute :attr:`built` is set to True. This builder requires
    Affymetrix annotations data already loaded in KDVS DB and wrapped in a DSV
    instance.

    Parameters
    ----------
    anno_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains Affymetrix annotations data

    hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
        currently unused, added for compatibility

    map_db_key : string
        ID of the database that will hold the mapping table

    Raises
    ------
    Error
        if the DSV containing Affymetrix annotations data is incorrectly
        specified, is not created, or is empty
    """
    #
    # NOTE: in this map, we strictly follow the information from annotations,
    # without resolving symbols in HGNC data
    #
    # ---- check conditions for ANNO
    if not isinstance(anno_dsv, DSV):
        raise Error('%s instance expected! (got %s)' % (DSV.__class__, anno_dsv.__class__))
    if not anno_dsv.isCreated():
        raise Error('Helper data table %s must be created first!' % quote(anno_dsv.name))
    if anno_dsv.isEmpty():
        raise Error('Helper data table %s must not be empty!' % quote(anno_dsv.name))
    # ---- create em2annotation
    em2annotation_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key, EM2ANNOTATION_TMPL)
    em2annotation_dt.create(indexed_columns=EM2ANNOTATION_TMPL['indexes'])
    # query ANNO for basic annotations: probeset ID, representative public ID,
    # gene symbol, Entrez Gene ID, GB accession
    # NOTE: we need a cursor due to the sheer quantity of data
    query_anno_columns = (anno_dsv.id_column, self._ANNO_REPR_PUBLIC_ID_COL,
                          self._ANNO_SEQ_SOURCE_COL, self._ANNO_GENE_SYMBOL_COL,
                          self._ANNO_EGENE_ID_COL, self._ANNO_GB_ACC_COL)
    anno_cs = anno_dsv.get(columns=query_anno_columns)

    # build em2annotation
    def _gen():
        for arow in anno_cs:
            probeset_id, repr_pub_id, seqsrc, gss, egeneids, gbacc = [str(ar) for ar in arow]
            # reconstruct correct public ID
            pubid = '%s:%s' % (self._ANNO_SEQSRC_ABBRS[seqsrc], repr_pub_id)
            # NOTE: the multifields "Gene Symbol" and "ENTREZ_GENE_ID" in
            # Affymetrix annotations are known not to be ordered consistently;
            # therefore we simply unify the multifield separator and report the
            # order as-is; it is up to the user to reconstruct the correct
            # pairing when querying Entrez Gene manually afterwards
            gs = MULTIFIELD_SEP.join([s.strip() for s in gss.split(self._ANNO_MULTIFIELD_SEP)])
            egeneid = MULTIFIELD_SEP.join([s.strip() for s in egeneids.split(self._ANNO_MULTIFIELD_SEP)])
            yield (probeset_id, gs, pubid, gbacc, egeneid, '', '')

    em2annotation_dt.load(_gen())
    # ---- query em2annotation and build map
    # NOTE: we need a cursor due to the sheer quantity of data
    query_em2a_columns = (em2annotation_dt.id_column, 'gene_symbol')
    em2a_cs = em2annotation_dt.get(columns=query_em2a_columns)
    for em2a_row in em2a_cs:
        pr_id, gs = [str(r) for r in em2a_row]
        self.gene2emid[gs] = pr_id
    em2a_cs.close()
    self.built = True
    self.dbt = em2annotation_dt
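
# Hedged usage sketch (not part of the library): 'anno_mapper' is assumed to be
# an already-constructed instance of the mapping class that owns the build()
# method above, and 'anno_dsv'/'hgnc_dsv' loaded DSV wrappers. After build(),
# gene symbols can be looked up through the public 'gene2emid' attribute (a
# plain dict is assumed here), and the backing table is reachable via 'dbt'.
# 'em2annotation_db' and 'TP53' are hypothetical placeholder values.
def _example_build_anno_map(anno_mapper, anno_dsv, hgnc_dsv, map_db_key='em2annotation_db'):
    anno_mapper.build(anno_dsv, hgnc_dsv, map_db_key)  # hgnc_dsv unused by this builder
    assert anno_mapper.built
    # resolve a gene symbol to the probeset (em) ID recorded for it, if any
    probeset_id = anno_mapper.gene2emid.get('TP53')
    return probeset_id, anno_mapper.dbt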
def build(self, anno_dsv, map_db_key):
    r"""
    Construct the mapping using resources already present in KDVS DB (via
    :class:`~kdvs.core.db.DBManager`) and wrapped in :class:`~kdvs.fw.DSV.DSV`
    instances. The mapping is built as a database table and wrapped into a
    :class:`~kdvs.fw.DBTable.DBTable` instance; it is stored in the public
    attribute :attr:`dbt` of this instance. After the build is finished, the
    public attribute :attr:`built` is set to True. This builder requires
    Affymetrix annotations data already loaded in KDVS DB and wrapped in a DSV
    instance.

    Parameters
    ----------
    anno_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains Affymetrix annotations data

    map_db_key : string
        ID of the database that will hold the mapping table

    Raises
    ------
    Error
        if the DSV containing Affymetrix annotations data is incorrectly
        specified, is not created, or is empty
    """
    # NOTE: this map utilizes GO as prior knowledge source and uses specific
    # features of this source, such as evidence codes
    # ---- check conditions
    if not isinstance(anno_dsv, DSV):
        raise Error('%s instance expected! (got %s)' % (DSV.__class__, anno_dsv.__class__))
    if not anno_dsv.isCreated():
        raise Error('Helper data table %s must be created first!' % quote(anno_dsv.name))
    if anno_dsv.isEmpty():
        raise Error('Helper data table %s must not be empty!' % quote(anno_dsv.name))
    # ---- create goterm2em
    goterm2em_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key, GOTERM2EM_TMPL)
    goterm2em_dt.create(indexed_columns=GOTERM2EM_TMPL['indexes'])
    # ---- specify data subset from ANNO
    query_domain_columns = (anno_dsv.id_column, self._SEQ_TYPE_COL,
                            self._GO_BP_COL, self._GO_MF_COL, self._GO_CC_COL)
    ctrl_seq_tag = self._CTRL_SEQUENCE_TAG
    terms_missing = self._TERMS_MISSING
    term_separator = self._TERM_SEPARATOR
    term_part_separator = self._TERM_INTER_SEPARATOR
    # ---- query data subset and build term2probeset
    res = anno_dsv.getAll(columns=query_domain_columns, as_dict=False)

    def _build_map():
        for r in res:
            msid, seq_type, bp_s, mf_s, cc_s = [str(pr) for pr in r]
            if seq_type != ctrl_seq_tag:
                for ns, terms_s in ((GO_BP_DS, bp_s), (GO_MF_DS, mf_s), (GO_CC_DS, cc_s)):
                    if terms_s != terms_missing:
                        for term in terms_s.split(term_separator):
                            tid, term_desc, ev_long = [x.strip() for x in term.split(term_part_separator)]
                            try:
                                term_ev_code = GO_INV_EVIDENCE_CODES[ev_long]
                            except KeyError:
                                term_ev_code = GO_UNKNOWN_EV_CODE
                            term_id = GO_num2id(tid)
                            yield term_id, msid, term_ev_code, term_desc, ns

    goterm2em_dt.load(_build_map())
    # ---- query term2probeset
    query_t2em_columns = (goterm2em_dt.id_column, 'em_id', 'term_domain')
    res = goterm2em_dt.getAll(columns=query_t2em_columns, as_dict=False)
    # build final map
    for r in res:
        tid, msid, dom = [str(pr) for pr in r]
        # update domain-unaware map
        self.pkc2emid[tid] = msid
        # update domain-aware map
        self.domains_map[dom][tid] = msid
    self.built = True
    self.dbt = goterm2em_dt
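
# Hedged usage sketch (not part of the library): 'go_mapper' is assumed to be an
# already-constructed instance of the class that owns the build() method above.
# After build(), 'pkc2emid' maps a GO term ID to a measurement (probeset) ID
# regardless of domain, while 'domains_map' keeps the same mapping split by GO
# domain (the GO_BP_DS / GO_MF_DS / GO_CC_DS constants used above); dict-like
# access is assumed for both. 'goterm2em_db' and 'GO:0006915' are hypothetical
# placeholders; the actual term ID format is whatever GO_num2id() produces.
def _example_query_go_map(go_mapper, anno_dsv, map_db_key='goterm2em_db'):
    go_mapper.build(anno_dsv, map_db_key)
    assert go_mapper.built
    # domain-unaware lookup
    em_id = go_mapper.pkc2emid.get('GO:0006915')
    # domain-aware lookup restricted to the biological_process namespace
    bp_em_id = go_mapper.domains_map[GO_BP_DS].get('GO:0006915')
    return em_id, bp_em_id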
def build(self, anno_dsv, hgnc_dsv, map_db_key):
    r"""
    Construct the mapping using resources already present in KDVS DB (via
    :class:`~kdvs.core.db.DBManager`) and wrapped in :class:`~kdvs.fw.DSV.DSV`
    instances. The mapping is built as a database table and wrapped into a
    :class:`~kdvs.fw.DBTable.DBTable` instance; it is stored in the public
    attribute :attr:`dbt` of this instance. After the build is finished, the
    public attribute :attr:`built` is set to True. This builder requires both
    Affymetrix annotations data and HGNC data already loaded in KDVS DB and
    wrapped in DSV instances. Refer to the comments below for the symbol
    resolution protocol used.

    Parameters
    ----------
    anno_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains Affymetrix annotations data

    hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
        valid instance of DSV that contains HGNC data

    map_db_key : string
        ID of the database that will hold the mapping table

    Raises
    ------
    Error
        if the DSV containing Affymetrix annotations data is incorrectly
        specified, is not created, or is empty
    Error
        if the DSV containing HGNC data is incorrectly specified, is not
        created, or is empty
    """
    #
    # NOTE: this map follows the symbol resolution protocol implemented
    # originally in KDVS v 1.0.
    #
    # NOTE: in this map, we apply the following resolution protocol:
    #   1. get gene symbol(s) from annotations
    #   2. resolve them in HGNC data as follows:
    #      - if the symbol is approved and refers to a gene, retain it
    #      - if the symbol is not approved, discard it
    #   3. for the retained symbol(s) obtained in (2), get the following
    #      element(s) from HGNC data: Entrez Gene ID, Ensembl Gene ID, RefSeq IDs
    #   4. for the retained symbol(s) obtained in (2), get the following
    #      element(s) from annotations: GB accession
    #
    # NOTE: in this map, we rely on annotations as the source of external
    # IDs; we still retain the sequence public ID from annotations; gene symbols
    # are only checked for being approved (i.e. the approval history is not
    # followed)
    #
    # ---- check conditions for ANNO
    if not isinstance(anno_dsv, DSV):
        raise Error('%s instance expected! (got %s)' % (DSV.__class__, anno_dsv.__class__))
    if not anno_dsv.isCreated():
        raise Error('Helper data table %s must be created first!' % quote(anno_dsv.name))
    if anno_dsv.isEmpty():
        raise Error('Helper data table %s must not be empty!' % quote(anno_dsv.name))
    # ---- check conditions for HGNC
    if not isinstance(hgnc_dsv, DSV):
        raise Error('%s instance expected! (got %s)' % (DSV.__class__, hgnc_dsv.__class__))
    if not hgnc_dsv.isCreated():
        raise Error('Helper data table %s must be created first!' % quote(hgnc_dsv.name))
    if hgnc_dsv.isEmpty():
        raise Error('Helper data table %s must not be empty!' % quote(hgnc_dsv.name))
    # ---- create em2annotation
    em2annotation_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key, EM2ANNOTATION_TMPL)
    em2annotation_dt.create(indexed_columns=EM2ANNOTATION_TMPL['indexes'])
    # pre-query data from HGNC: approved symbol, Entrez Gene ID,
    # Ensembl Gene ID, RefSeq IDs; filter out non-gene entries
    # NOTE: we need a cursor due to the sheer quantity of data
    hgnc_data = dict()
    only_genes_filter = "%s in (%s)" % (
        quote(self._HGNC_LOCUS_TYPE_COL),
        ','.join([quote(t) for t in self._HGNC_VALID_GENE_TYPES]))
    query_hgnc_columns = (self._HGNC_APPROVED_SYMBOL_COL, self._HGNC_EGENE_ID_COL,
                          self._HGNC_ENSEMBL_ID_COL, self._HGNC_REFSEQ_ID_COL)
    hgnc_cs = hgnc_dsv.get(columns=query_hgnc_columns, filter_clause=only_genes_filter)
    for hgnc_row in hgnc_cs:
        approved, egeneid, ensembl, refseq = [str(r) for r in hgnc_row]
        hgnc_data[approved] = (egeneid, ensembl, refseq)
    hgnc_cs.close()
    # query ANNO for basic annotations: probeset ID, representative public ID,
    # gene symbol, GB accession
    # NOTE: we need a cursor due to the sheer quantity of data
    query_anno_columns = (anno_dsv.id_column, self._ANNO_REPR_PUBLIC_ID_COL,
                          self._ANNO_SEQ_SOURCE_COL, self._ANNO_GENE_SYMBOL_COL,
                          self._ANNO_GB_ACC_COL)
    anno_cs = anno_dsv.get(columns=query_anno_columns)

    def _gen():
        for arow in anno_cs:
            probeset_id, repr_pub_id, seqsrc, gss_str, gbacc = [str(ar) for ar in arow]
            # reconstruct correct public ID
            pubid = '%s:%s' % (self._ANNO_SEQSRC_ABBRS[seqsrc], repr_pub_id)
            # separate gene symbols
            gss = [s.strip() for s in gss_str.split(self._ANNO_MULTIFIELD_SEP)]
            gs_rec = list()
            egeneid_rec = list()
            ensembl_rec = list()
            refseq_rec = list()
            for gs in gss:
                if gs in hgnc_data:
                    # gene symbol is approved
                    gs_rec.append(gs)
                    egeneid, ensembl, refseq = hgnc_data[gs]
                    egeneid_rec.append(egeneid)
                    ensembl_rec.append(ensembl)
                    refseq_rec.append(refseq)
            gs_s = MULTIFIELD_SEP.join(gs_rec)
            egeneid_s = MULTIFIELD_SEP.join(egeneid_rec) if len(egeneid_rec) > 0 else ''
            ensembl_s = MULTIFIELD_SEP.join(ensembl_rec) if len(ensembl_rec) > 0 else ''
            refseq_s = MULTIFIELD_SEP.join(refseq_rec) if len(refseq_rec) > 0 else ''
            yield (probeset_id, gs_s, pubid, gbacc, egeneid_s, ensembl_s, refseq_s)

    em2annotation_dt.load(_gen())
    # ---- query em2annotation and build map
    # NOTE: we need a cursor due to the sheer quantity of data
    query_em2a_columns = (em2annotation_dt.id_column, 'gene_symbol')
    em2a_cs = em2annotation_dt.get(columns=query_em2a_columns)
    for em2a_row in em2a_cs:
        pr_id, gs = [str(r) for r in em2a_row]
        self.gene2emid[gs] = pr_id
    em2a_cs.close()
    self.built = True
    self.dbt = em2annotation_dt
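
# Hedged sketch (not part of the library): isolates the symbol-resolution step
# of the protocol described in the comments above, outside the DB machinery.
# 'hgnc_data' is the same {approved_symbol: (entrez_id, ensembl_id, refseq_ids)}
# dictionary built by the method, 'raw_symbols' is the raw multifield
# gene-symbol string from annotations, and the separators are passed in as
# assumed defaults because the real values (self._ANNO_MULTIFIELD_SEP,
# MULTIFIELD_SEP) live in the surrounding class/module.
def _example_resolve_symbols(raw_symbols, hgnc_data, anno_sep=' /// ', out_sep='|'):
    retained = []
    egene_ids, ensembl_ids, refseq_ids = [], [], []
    for gs in (s.strip() for s in raw_symbols.split(anno_sep)):
        if gs in hgnc_data:
            # approved gene symbol: retain it and collect its HGNC identifiers
            retained.append(gs)
            egeneid, ensembl, refseq = hgnc_data[gs]
            egene_ids.append(egeneid)
            ensembl_ids.append(ensembl)
            refseq_ids.append(refseq)
        # non-approved symbols are silently discarded, as in step 2 above
    return (out_sep.join(retained),
            out_sep.join(egene_ids),
            out_sep.join(ensembl_ids),
            out_sep.join(refseq_ids))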