Example #1
    def setUp(self):
        self.test_write_root = TEST_INVARIANTS['test_write_root']
        self.testdb = 'DB1'
        self.test_cols = ('A', 'B', 'C')
        self.test_dtname = 'Test1'
        self.dbm = DBManager(self.test_write_root)
        self.dt1 = DBTable(self.dbm,
                           self.testdb,
                           self.test_cols,
                           name=self.test_dtname)
        self.dt1.create()
        self.dt1.load(self.__gen1())
        self.rcs = self.dt1.db.cursor()
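
A standard-library sketch of the create-then-load-from-a-generator pattern used in this setUp; the table layout and helper names below are illustrative, not the kdvs API:

import sqlite3
import string

def gen_rows(cols=('A', 'B', 'C')):
    # same shape of data as the __gen1 generator used by these tests: 'A1'..'Z3'
    nums = range(1, len(cols) + 1)
    for letter in string.ascii_uppercase:
        yield tuple('%s%s' % (letter, n) for n in nums)

conn = sqlite3.connect(':memory:')
conn.execute('create table Test1 (A text, B text, C text)')
# executemany consumes the generator lazily, similar in spirit to DBTable.load()
conn.executemany('insert into Test1 values (?, ?, ?)', gen_rows())
print(conn.execute('select count(*) from Test1').fetchone()[0])  # 26
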
Example #2
    def __init__(self, dbm, db_key, protocol=None):
        r"""
Parameters
----------
dbm : :class:`~kdvs.core.db.DBManager`
    instance of DBManager that will control the database table

db_key : string
    identifier of the database table that will be used by DBManager

protocol : integer/None
    pickling protocol; if None, then the highest one will be used

See Also
--------
pickle
        """
        self.tmpl = DBSHELVE_TMPL
        self.dtsh = DBTable.fromTemplate(dbm, db_key, template=self.tmpl)
        if protocol is not None:
            self.protocol = protocol
        else:
            self.protocol = cPickle.HIGHEST_PROTOCOL
        self.dtsh.create(indexed_columns=self.tmpl['indexes'])
        self.key = self.tmpl['columns'][0]
        self.val = self.tmpl['columns'][1]
        self.name = self.tmpl['name']
        self.cs = self.dtsh.db.cursor()
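
A minimal, self-contained sketch of the same pattern as the constructor above (a shelve-like key/value store backed by a database table, pickling values with the highest protocol when none is given), using only the standard library; the class and table names here are illustrative, not part of kdvs:

import pickle
import sqlite3

class SimpleDBShelve(object):
    """Illustrative key/value shelf over an SQLite table with pickled values."""

    def __init__(self, path=':memory:', protocol=None):
        self.conn = sqlite3.connect(path)
        # as in the snippet above, fall back to the highest pickle protocol
        self.protocol = protocol if protocol is not None else pickle.HIGHEST_PROTOCOL
        self.conn.execute(
            'create table if not exists shelf (key text primary key, value blob)')

    def __setitem__(self, key, value):
        blob = sqlite3.Binary(pickle.dumps(value, self.protocol))
        self.conn.execute(
            'insert or replace into shelf (key, value) values (?, ?)', (key, blob))

    def __getitem__(self, key):
        row = self.conn.execute(
            'select value from shelf where key = ?', (key,)).fetchone()
        if row is None:
            raise KeyError(key)
        return pickle.loads(bytes(row[0]))

# usage
sh = SimpleDBShelve()
sh['GENE1'] = {'symbol': 'GENE1', 'synonyms': ['G1']}
print(sh['GENE1'])
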
Example #3
def generateHGNCSynonyms(hgnc_dsv, map_db_key):
    r"""
Create a helper table that eases resolving synonymous gene symbols with HGNC data.
The helper table may be created in a different subordinate database than the original
HGNC data. The table is specified via a template.

Parameters
----------
hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains HGNC data

map_db_key : string
    ID of the database that will hold helper table

Returns
-------
synonymsDT : :class:`~kdvs.fw.DBTable.DBTable`
    DBTable wrapper for newly created helper table

See Also
--------
kdvs.fw.DBTable.DBTemplate
    """
    synonyms_dt = DBTable.fromTemplate(hgnc_dsv.dbm, map_db_key,
                                       HGNCSYNONYMS_TMPL)
    synonyms_dt.create(indexed_columns=HGNCSYNONYMS_TMPL['indexes'])
    syn_columns = (HGNC_APPROVED_SYMBOL_COL, HGNC_SYNONYMS_COL)
    syn_filter = "%s not like %s" % (quote(HGNC_SYNONYMS_COL),
                                     quote(HGNC_FIELD_EMPTY))
    hgnc_cs = hgnc_dsv.get(columns=syn_columns, filter_clause=syn_filter)

    def _gen():
        for syn_row in hgnc_cs:
            approved, rawsyns = [str(r) for r in syn_row]
            if len(rawsyns) > 0:
                # parse synonyms
                syns = [s.strip() for s in rawsyns.split(HGNC_FIELDS_SEP)]
            else:
                # no synonyms, make approved its own synonym
                syns = [approved]
            for syn in syns:
                yield (syn, approved)

    synonyms_dt.load(_gen())
    hgnc_cs.close()
    return synonyms_dt
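
For illustration, the synonym-expansion logic inside _gen can be sketched over plain tuples, independent of the kdvs table machinery; the separator and the sample rows below are assumptions, not the actual HGNC constants:

HGNC_FIELDS_SEP = ','  # assumed field separator, for the sketch only

def gen_synonym_pairs(rows):
    """Yield (synonym, approved_symbol) pairs from (approved, raw_synonyms) rows."""
    for approved, rawsyns in rows:
        if rawsyns:
            syns = [s.strip() for s in rawsyns.split(HGNC_FIELDS_SEP)]
        else:
            # no synonyms: make the approved symbol its own synonym
            syns = [approved]
        for syn in syns:
            yield (syn, approved)

# usage
rows = [('TP53', 'p53, LFS1'), ('BRCA1', '')]
print(list(gen_synonym_pairs(rows)))
# [('p53', 'TP53'), ('LFS1', 'TP53'), ('BRCA1', 'BRCA1')]
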
Example #4
class TestDBResult1(unittest.TestCase):
    def __gen1(self):
        nums = range(1, len(self.test_cols) + 1)
        for l in string.ascii_uppercase:
            yield tuple(["%s%s" % (l, n) for n in nums])

    def setUp(self):
        self.test_write_root = TEST_INVARIANTS['test_write_root']
        self.testdb = 'DB1'
        self.test_cols = ('A', 'B', 'C')
        self.test_dtname = 'Test1'
        self.dbm = DBManager(self.test_write_root)
        self.dt1 = DBTable(self.dbm,
                           self.testdb,
                           self.test_cols,
                           name=self.test_dtname)
        self.dt1.create()
        self.dt1.load(self.__gen1())
        self.rcs = self.dt1.db.cursor()

    def tearDown(self):
        self.dbm.close()
        db1_path = os.path.abspath('%s/%s.db' %
                                   (self.test_write_root, self.testdb))
        rootdb_path = os.path.abspath('%s/%s.root.db' %
                                      (self.test_write_root, SYSTEM_NAME_LC))
        if os.path.exists(db1_path):
            os.remove(db1_path)
        if os.path.exists(rootdb_path):
            os.remove(rootdb_path)
        self.dbm = None

    def testDBR_get1(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get generator within cursor size limits
        res = list(dbr.get())
        ref_res = list(self.dt1.get())
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get2(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as iterable within cursor size limits
        res = dbr.getAll()
        ref_res = list(self.dt1.get())
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get3(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as iterable within cursor size limits
        res = dbr.getAll(as_dict=False, dict_on_rows=False)
        ref_res = list(self.dt1.get())
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get4(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as dict within cursor size limits, keyed on columns
        res = dbr.getAll(as_dict=True, dict_on_rows=False)
        ref_res = {}
        for ix, c in enumerate(self.test_cols):
            ref_res[c] = [
                u"%s%d" % (l, ix + 1) for l in string.ascii_uppercase
            ]
        self.assertDictEqual(ref_res, res)

    def testDBR_get5(self):
        self.rcs.execute('select * from %s' % self.test_dtname)
        dbr = DBResult(self.dt1, self.rcs)
        # get all as dict within cursor size limits, keyed on rows
        res = dbr.getAll(as_dict=True, dict_on_rows=True)
        ref_res = {}
        numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
        for l in string.ascii_uppercase:
            key = u'%s%s' % (l, numsuffs[0])
            vls = [u'%s%s' % (l, ns) for ns in numsuffs[1:]]
            if key not in ref_res:
                ref_res[key] = vls
        self.assertDictEqual(ref_res, res)

    def testDBR_get6(self):
        # generate 100x load
        for _ in range(99):
            self.dt1.load(self.__gen1())
        self.rcs.execute('select * from %s' % self.test_dtname)
        # get all as iterable with limited cursor size, 26 internal loops
        dbr = DBResult(self.dt1, self.rcs, rowbufsize=100)
        # get all results at once
        res = list(dbr.get())
        numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
        ref_res = []
        for i in range(100):
            for l in string.ascii_uppercase:
                tup = tuple([u'%s%s' % (l, ns) for ns in numsuffs])
                ref_res.append(tup)
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get7(self):
        # generate 100x load
        for _ in range(99):
            self.dt1.load(self.__gen1())
        self.rcs.execute('select * from %s' % self.test_dtname)
        # get all as iterable with limited cursor size, 3 internal loops
        dbr = DBResult(self.dt1, self.rcs, rowbufsize=1000)
        # get all results at once
        res = list(dbr.get())
        numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
        ref_res = []
        for i in range(100):
            for l in string.ascii_uppercase:
                tup = tuple([u'%s%s' % (l, ns) for ns in numsuffs])
                ref_res.append(tup)
        self.assertSequenceEqual(ref_res, res)

    def testDBR_get8(self):
        # generate 100x load
        for _ in range(99):
            self.dt1.load(self.__gen1())
        self.rcs.execute('select * from %s' % self.test_dtname)
        # get all as iterable with limited cursor size, 26 internal loops
        dbr = DBResult(self.dt1, self.rcs, rowbufsize=100)

        def __gen():
            numsuffs = ["%d" % (i[0] + 1) for i in enumerate(self.test_cols)]
            for i in range(100):
                for l in string.ascii_uppercase:
                    tup = tuple([u'%s%s' % (l, ns) for ns in numsuffs])
                    yield tup

        ref_gen = __gen()
        # iterate over single results
        for rtup in dbr.get():
            self.assertEqual(next(ref_gen), rtup)
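
DBResult's rowbufsize is not documented in these snippets; a plausible reading, sketched below with sqlite3 alone, is batched fetching via the DB-API fetchmany, possibly what the "internal loops" comments in the tests allude to:

import sqlite3

def iter_rows(cursor, rowbufsize=100):
    """Yield rows in batches of rowbufsize using DB-API fetchmany."""
    while True:
        batch = cursor.fetchmany(rowbufsize)
        if not batch:
            break
        for row in batch:
            yield row

# usage: 260 rows fetched in batches of 100
conn = sqlite3.connect(':memory:')
conn.execute('create table t (a text, b text)')
conn.executemany('insert into t values (?, ?)',
                 [('A%d' % i, 'B%d' % i) for i in range(260)])
cur = conn.cursor()
cur.execute('select * from t')
print(sum(1 for _ in iter_rows(cur, rowbufsize=100)))  # 260
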
Example #5
    def build(self, anno_dsv, hgnc_dsv, map_db_key):
        r"""
Construct the mapping using resources already present in the KDVS DB (via
:class:`~kdvs.core.db.DBManager`) and wrapped in :class:`~kdvs.fw.DSV.DSV`
instances. The mapping is built as a database table and wrapped
in a :class:`~kdvs.fw.DBTable.DBTable` instance; it is stored in the public attribute
:attr:`dbt` of this instance. After the build is finished, the public attribute
:attr:`built` is set to True. This builder requires Affymetrix annotation data
already loaded in the KDVS DB and wrapped in a DSV instance.

Parameters
----------
anno_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains Affymetrix annotations data

hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
    currently unused, added for compatibility

map_db_key : string
    ID of the database that will hold mapping table

Raises
------
Error
    if DSV containing Affymetrix annotation data is incorrectly specified, is
    not created, or is empty
        """
        #
        # NOTE: in this map, we follow strictly the information from annotations,
        # without resolving symbols in HGNC data
        #
        # ---- check conditions for ANNO
        if not isinstance(anno_dsv, DSV):
            raise Error('%s instance expected! (got %s)' %
                        (DSV.__name__, anno_dsv.__class__.__name__))
        if not anno_dsv.isCreated():
            raise Error('Helper data table %s must be created first!' %
                        quote(anno_dsv.name))
        if anno_dsv.isEmpty():
            raise Error('Helper data table %s must not be empty!' %
                        quote(anno_dsv.name))
        # ---- create em2annotation
        em2annotation_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key,
                                                EM2ANNOTATION_TMPL)
        em2annotation_dt.create(indexed_columns=EM2ANNOTATION_TMPL['indexes'])
        # query ANNO for basic annotations: probeset ID, representative public ID,
        # gene symbol, Entrez Gene ID, GB accession
        # NOTE: we need cursor due to sheer quantity of data
        query_anno_columns = (anno_dsv.id_column,
                              self._ANNO_REPR_PUBLIC_ID_COL,
                              self._ANNO_SEQ_SOURCE_COL,
                              self._ANNO_GENE_SYMBOL_COL,
                              self._ANNO_EGENE_ID_COL, self._ANNO_GB_ACC_COL)
        anno_cs = anno_dsv.get(columns=query_anno_columns)

        # build em2annotation
        def _gen():
            for arow in anno_cs:
                probeset_id, repr_pub_id, seqsrc, gss, egeneids, gbacc = [
                    str(ar) for ar in arow
                ]
                # reconstruct correct public ID
                pubid = '%s:%s' % (self._ANNO_SEQSRC_ABBRS[seqsrc],
                                   repr_pub_id)
                # NOTE: multifields "Gene Symbol" and "ENTREZ_GENE_ID" in
                # Affymetrix annotations are known not to be ordered accordingly;
                # therefore we simply unify multifield separator and report the
                # order as-is; it is up to the user to reconstruct correct
                # pairing when querying manually in Entrez Gene afterwards
                gs = MULTIFIELD_SEP.join(
                    [s.strip() for s in gss.split(self._ANNO_MULTIFIELD_SEP)])
                egeneid = MULTIFIELD_SEP.join([
                    s.strip()
                    for s in egeneids.split(self._ANNO_MULTIFIELD_SEP)
                ])
                yield (probeset_id, gs, pubid, gbacc, egeneid, '', '')

        em2annotation_dt.load(_gen())
        # ---- query em2annotation and build map
        # NOTE: we need cursor due to sheer quantity of data
        query_em2a_columns = (em2annotation_dt.id_column, 'gene_symbol')
        em2a_cs = em2annotation_dt.get(columns=query_em2a_columns)
        for em2a_row in em2a_cs:
            pr_id, gs = [str(r) for r in em2a_row]
            self.gene2emid[gs] = pr_id
        em2a_cs.close()
        self.built = True
        self.dbt = em2annotation_dt
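
The multifield normalization inside _gen above (splitting on the annotation separator and re-joining with the unified KDVS separator, order preserved) can be shown standalone; the separator values below are assumptions for the sketch, not the real constants:

ANNO_MULTIFIELD_SEP = '///'   # assumed Affymetrix-style multifield separator
MULTIFIELD_SEP = ';'          # assumed unified separator

def unify_multifield(raw):
    """Split a raw annotation multifield and re-join it with a single separator,
    preserving the original element order (as the builder above does)."""
    return MULTIFIELD_SEP.join(s.strip() for s in raw.split(ANNO_MULTIFIELD_SEP))

# usage
print(unify_multifield('TP53 /// TP53BP1'))  # 'TP53;TP53BP1'
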
Example #6
    def build(self, anno_dsv, map_db_key):
        r"""
Construct the mapping using resources already present in the KDVS DB (via
:class:`~kdvs.core.db.DBManager`) and wrapped in :class:`~kdvs.fw.DSV.DSV`
instances. The mapping is built as a database table and wrapped in a
:class:`~kdvs.fw.DBTable.DBTable` instance; it is stored in the public attribute
:attr:`dbt` of this instance. After the build is finished, the public attribute
:attr:`built` is set to True. This builder requires Affymetrix annotation data
already loaded in the KDVS DB and wrapped in a DSV instance.

Parameters
----------
anno_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains Affymetrix annotations data

map_db_key : string
    ID of the database that will hold mapping table

Raises
------
Error
    if DSV containing Affymetrix annotation data is incorrectly specified, is
    not created, or is empty
        """
        # NOTE: this map utilizes GO as a prior knowledge source and uses specific
        # features of this source, such as evidence codes

        # ---- check conditions
        if not isinstance(anno_dsv, DSV):
            raise Error('%s instance expected! (got %s)' %
                        (DSV.__name__, anno_dsv.__class__.__name__))
        if not anno_dsv.isCreated():
            raise Error('Helper data table %s must be created first!' %
                        quote(anno_dsv.name))
        if anno_dsv.isEmpty():
            raise Error('Helper data table %s must not be empty!' %
                        quote(anno_dsv.name))
        # ---- create goterm2em
        goterm2em_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key,
                                            GOTERM2EM_TMPL)
        goterm2em_dt.create(indexed_columns=GOTERM2EM_TMPL['indexes'])
        # ---- specify data subset from ANNO
        query_domain_columns = (anno_dsv.id_column, self._SEQ_TYPE_COL,
                                self._GO_BP_COL, self._GO_MF_COL,
                                self._GO_CC_COL)
        ctrl_seq_tag = self._CTRL_SEQUENCE_TAG
        terms_missing = self._TERMS_MISSING
        term_separator = self._TERM_SEPARATOR
        term_part_separator = self._TERM_INTER_SEPARATOR
        # ---- query data subset and build term2probeset
        res = anno_dsv.getAll(columns=query_domain_columns, as_dict=False)

        def _build_map():
            for r in res:
                msid, seq_type, bp_s, mf_s, cc_s = [str(pr) for pr in r]
                if seq_type != ctrl_seq_tag:
                    for ns, terms_s in ((GO_BP_DS, bp_s), (GO_MF_DS, mf_s),
                                        (GO_CC_DS, cc_s)):
                        if terms_s != terms_missing:
                            for term in terms_s.split(term_separator):
                                tid, term_desc, ev_long = [
                                    x.strip()
                                    for x in term.split(term_part_separator)
                                ]
                                try:
                                    term_ev_code = GO_INV_EVIDENCE_CODES[
                                        ev_long]
                                except KeyError:
                                    term_ev_code = GO_UNKNOWN_EV_CODE
                                term_id = GO_num2id(tid)
                                yield term_id, msid, term_ev_code, term_desc, ns

        goterm2em_dt.load(_build_map())
        # ---- query term2probeset
        query_t2em_columns = (goterm2em_dt.id_column, 'em_id', 'term_domain')
        res = goterm2em_dt.getAll(columns=query_t2em_columns, as_dict=False)
        # build final map
        for r in res:
            tid, msid, dom = [str(pr) for pr in r]
            # update domain-unaware map
            self.pkc2emid[tid] = msid
            # update domain-aware map
            self.domains_map[dom][tid] = msid
        self.built = True
        self.dbt = goterm2em_dt
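
A standalone sketch of the GO term parsing performed in _build_map, with assumed separators and a tiny stand-in for the real GO_INV_EVIDENCE_CODES table and the GO_num2id helper:

TERM_SEPARATOR = '///'       # assumed separator between GO term entries
TERM_PART_SEPARATOR = '//'   # assumed separator inside a single entry
EVIDENCE_CODES = {'inferred from electronic annotation': 'IEA'}  # tiny stand-in
UNKNOWN_EV_CODE = 'ND'       # assumed fallback code

def parse_go_terms(terms_s):
    """Yield (term_id, evidence_code, description) for each entry in a raw field."""
    for term in terms_s.split(TERM_SEPARATOR):
        tid, desc, ev_long = [x.strip() for x in term.split(TERM_PART_SEPARATOR)]
        ev_code = EVIDENCE_CODES.get(ev_long, UNKNOWN_EV_CODE)
        # normalize the numeric part into a zero-padded GO identifier
        yield ('GO:%07d' % int(tid), ev_code, desc)

# usage
raw = '6915 // apoptotic process // inferred from electronic annotation'
print(list(parse_go_terms(raw)))
# [('GO:0006915', 'IEA', 'apoptotic process')]
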
Example #7
    def build(self, anno_dsv, hgnc_dsv, map_db_key):
        r"""
Construct the mapping using resources already present in the KDVS DB (via
:class:`~kdvs.core.db.DBManager`) and wrapped in :class:`~kdvs.fw.DSV.DSV`
instances. The mapping is built as a database table and wrapped in a
:class:`~kdvs.fw.DBTable.DBTable` instance; it is stored in the public attribute :attr:`dbt`
of this instance. After the build is finished, the public attribute :attr:`built`
is set to True. This builder requires both Affymetrix annotation data and HGNC
data already loaded in the KDVS DB and wrapped in DSV instances. Refer to the
source comments for the resolution protocol used.

Parameters
----------
anno_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains Affymetrix annotations data

hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains HGNC data

map_db_key : string
    ID of the database that will hold mapping table

Raises
------
Error
    if DSV containing Affymetrix annotation data is incorrectly specified, is
    not created, or is empty
Error
    if DSV containing HGNC data is incorrectly specified,
    is not created, or is empty
        """
        #
        # NOTE: this map follows the resolution protocol implemented originally
        # in KDVS v 1.0.
        #
        # NOTE: in this map, we apply the following resolution protocol:
        # 1. get gene symbol(s) from annotations
        # 2. resolve them in HGNC data as follows:
        #    - if the symbol is approved and refers to gene, retain it
        #    - if the symbol is not approved, discard it
        # 3. for the retained symbol(s) obtained in (2), get the following
        #    element(s) from HGNC data: Entrez Gene ID, Ensembl Gene ID, RefSeq IDs
        # 4. for the retained symbol(s) obtained in (2), get the following
        #    element(s) from annotations: GB accession
        #
        # NOTE: in this map, we rely on annotations as the source of external
        # IDs; we still retain sequence public ID from annotations; gene symbols
        # are only verified if approved (i.e. the approval history is not followed)
        #
        # ---- check conditions for ANNO
        if not isinstance(anno_dsv, DSV):
            raise Error('%s instance expected! (got %s)' %
                        (DSV.__name__, anno_dsv.__class__.__name__))
        if not anno_dsv.isCreated():
            raise Error('Helper data table %s must be created first!' %
                        quote(anno_dsv.name))
        if anno_dsv.isEmpty():
            raise Error('Helper data table %s must not be empty!' %
                        quote(anno_dsv.name))
        # ---- check conditions for HGNC
        if not isinstance(hgnc_dsv, DSV):
            raise Error('%s instance expected! (got %s)' %
                        (DSV.__name__, hgnc_dsv.__class__.__name__))
        if not hgnc_dsv.isCreated():
            raise Error('Helper data table %s must be created first!' %
                        quote(hgnc_dsv.name))
        if hgnc_dsv.isEmpty():
            raise Error('Helper data table %s must not be empty!' %
                        quote(hgnc_dsv.name))
        # ---- create em2annotation
        em2annotation_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key,
                                                EM2ANNOTATION_TMPL)
        em2annotation_dt.create(indexed_columns=EM2ANNOTATION_TMPL['indexes'])
        # pre-query data from HGNC: approved symbol, Entrez Gene ID,
        # Ensembl Gene ID, RefSeq IDs; filter non-gene entries
        # NOTE: we need cursor due to sheer quantity of data
        hgnc_data = dict()
        only_genes_filter = "%s in (%s)" % (quote(
            self._HGNC_LOCUS_TYPE_COL), ','.join(
                [quote(t) for t in self._HGNC_VALID_GENE_TYPES]))
        query_hgnc_columns = (self._HGNC_APPROVED_SYMBOL_COL,
                              self._HGNC_EGENE_ID_COL,
                              self._HGNC_ENSEMBL_ID_COL,
                              self._HGNC_REFSEQ_ID_COL)
        hgnc_cs = hgnc_dsv.get(columns=query_hgnc_columns,
                               filter_clause=only_genes_filter)
        for hgnc_row in hgnc_cs:
            approved, egeneid, ensembl, refseq = [str(r) for r in hgnc_row]
            hgnc_data[approved] = (egeneid, ensembl, refseq)
        hgnc_cs.close()
        # query ANNO for basic annotations: probeset ID, representative public ID,
        # gene symbol, GB accession
        # NOTE: we need cursor due to sheer quantity of data
        query_anno_columns = (anno_dsv.id_column,
                              self._ANNO_REPR_PUBLIC_ID_COL,
                              self._ANNO_SEQ_SOURCE_COL,
                              self._ANNO_GENE_SYMBOL_COL,
                              self._ANNO_GB_ACC_COL)
        anno_cs = anno_dsv.get(columns=query_anno_columns)

        def _gen():
            for arow in anno_cs:
                probeset_id, repr_pub_id, seqsrc, gss_str, gbacc = [
                    str(ar) for ar in arow
                ]
                # reconstruct correct public ID
                pubid = '%s:%s' % (self._ANNO_SEQSRC_ABBRS[seqsrc],
                                   repr_pub_id)
                # separate gene symbols
                gss = [
                    s.strip() for s in gss_str.split(self._ANNO_MULTIFIELD_SEP)
                ]
                gs_rec = list()
                egeneid_rec = list()
                ensembl_rec = list()
                refseq_rec = list()
                for gs in gss:
                    if gs in hgnc_data:
                        # gene symbol is approved
                        gs_rec.append(gs)
                        egeneid, ensembl, refseq = hgnc_data[gs]
                        egeneid_rec.append(egeneid)
                        ensembl_rec.append(ensembl)
                        refseq_rec.append(refseq)
                gs_s = MULTIFIELD_SEP.join(gs_rec)
                egeneid_s = MULTIFIELD_SEP.join(
                    egeneid_rec) if len(egeneid_rec) > 0 else ''
                ensembl_s = MULTIFIELD_SEP.join(
                    ensembl_rec) if len(ensembl_rec) > 0 else ''
                refseq_s = MULTIFIELD_SEP.join(
                    refseq_rec) if len(refseq_rec) > 0 else ''
                yield (probeset_id, gs_s, pubid, gbacc, egeneid_s, ensembl_s,
                       refseq_s)

        em2annotation_dt.load(_gen())
        # ---- query em2annotation and build map
        # NOTE: we need cursor due to sheer quantity of data
        query_em2a_columns = (em2annotation_dt.id_column, 'gene_symbol')
        em2a_cs = em2annotation_dt.get(columns=query_em2a_columns)
        for em2a_row in em2a_cs:
            pr_id, gs = [str(r) for r in em2a_row]
            self.gene2emid[gs] = pr_id
        em2a_cs.close()
        self.built = True
        self.dbt = em2annotation_dt
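
The core of the resolution protocol above (step 2: keep only annotation gene symbols that appear as approved symbols in HGNC, then collect their external IDs) can be sketched over plain dictionaries; the sample data and the ';' join separator are assumptions for illustration:

MULTIFIELD_SEP = ';'  # assumed unified separator

def resolve_symbols(annotation_symbols, hgnc_data):
    """Return (symbols, entrez_ids, ensembl_ids, refseq_ids) as joined strings,
    keeping only symbols approved in hgnc_data, as the builder above does."""
    kept, egene, ensembl, refseq = [], [], [], []
    for gs in annotation_symbols:
        if gs in hgnc_data:  # approved symbol: retain it, otherwise discard
            e_id, ens_id, rs_id = hgnc_data[gs]
            kept.append(gs)
            egene.append(e_id)
            ensembl.append(ens_id)
            refseq.append(rs_id)
    join = MULTIFIELD_SEP.join
    return join(kept), join(egene), join(ensembl), join(refseq)

# usage (identifiers here are illustrative sample data)
hgnc = {'TP53': ('7157', 'ENSG00000141510', 'NM_000546')}
print(resolve_symbols(['TP53', 'NOTAGENE'], hgnc))
# ('TP53', '7157', 'ENSG00000141510', 'NM_000546')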