Ejemplo n.º 1
0
    def countRows(self):
        r"""
Counts number of rows for the table. Table must be filled to obtain count >0.
Counting is performed with SQL standard function 'count' in underlying RDBMS.

Returns
-------
count : integer/None
    row count for the table, or None if underlying 'count' returns null

Raises
------
Error
    if table is not yet created
Error
    if counting was interrupted with an error; essentially, reraise
    OperationalError from underlying RDBMS
        """
        if not self.isCreated():
            raise Error('DataTable %s in %s must be first created!' % (quote(self.name), quote(self.db_key)))
        cs = self.db.cursor()
        cnt_st = 'select count(%s) from %s' % (quote(self.id_column), self.name)
        dberror = self.dbm.provider.getOperationalError()
        try:
            cs.execute(cnt_st)
        except dberror, e:
            raise Error('Cannot count from table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
Ejemplo n.º 2
0
    def getIDs(self):
        r"""
Get content of designated ID column and return it as list of values.

Returns
-------
IDs : list of strings
    list of values from ID column, as queried by underlying RDBMS; the list is
    sorted in insert order

Raises
------
Error
    if table has not yet been created
Error
    if querying of ID column was interrupted with an error; essentially, reraise
    OperationalError from underlying RDBMS
        """
        if not self.isCreated():
            raise Error('DataTable %s in %s must be first created!' % (quote(self.name), quote(self.db_key)))
        cs = self.db.cursor()
        st = 'select %s from %s' % (quote(self.id_column), self.name)
        dberror = self.dbm.provider.getOperationalError()
        try:
            cs.execute(st)
        except dberror, e:
            raise Error('Cannot get IDs from table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
Ejemplo n.º 3
0
def correctHGNCApprovedSymbols(hgnc_dsv):
    r"""
Manually correct HGNC data after loading into DB and wrapping into :class:`~kdvs.fw.DSV.DSV`
instance. It removes suffix (defined in :data:`HGNC_WITHDRAWN_GS_PART`)
from withdrawn symbols in order to fasten further querying. This call must be
used in the application that uses HGNC data, and must be made after HGNC data
has been loaded and wrapped into DSV instance.

Parameters
----------
hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains HGNC data
    """
    # Approved Symbol column in HGNC DSV data contains symbols with textual
    # suffix when symbol is withdrawn; to unify querying, we need to get
    # rid of the suffix
    # NOTE: in this exceptional case we use low level access to database
    # table in order to perform specific modification
    db = hgnc_dsv.db
    c = db.cursor()
    #    c.execute('update or ignore %s '
    #               'set "Approved Symbol"=rtrim("Approved Symbol","~withdrawn") '
    #               'where "Approved Symbol" like "%~withdrawn"'%(hgnc_table_name))
    st = 'update or ignore %s ' \
               'set %s=rtrim(%s,%s) ' \
               'where %s like %s' % (
                    hgnc_dsv.name, quote(HGNC_APPROVED_SYMBOL_COL),
                    quote(HGNC_APPROVED_SYMBOL_COL),
                    quote(HGNC_WITHDRAWN_GS_PART),
                    quote(HGNC_APPROVED_SYMBOL_COL),
                    quote('%' + HGNC_WITHDRAWN_GS_PART))
    c.execute(st)
    db.commit()
    c.close()
Ejemplo n.º 4
0
    def __init__(self, name=None, root_path=None, create_dbm=False):
        r"""
Parameters
----------
name : string/None
    name of the current instance; it will be used to identify all managed locations;
    if None, the name is generated randomly (UUID4)

root_path : string/None
    directory path that refers to the root of locations that will be managed by
    this instance; if None, default root path will be used ('~/.kdvs/')

create_dbm : boolean
    if True, default :class:`~kdvs.core.db.DBManager` instance will be created
    as well, rooted on specified root path; False by default

See Also
--------
uuid
os.path.expanduser
        """
        # ---- resolve instance name
        if name is None:
            self.name = uuid.uuid4().hex
        else:
            self.name = name
        # ---- resolve root_path
        self.def_root_path = os.path.expanduser('~/.%s/' % (SYSTEM_NAME_LC))
        if root_path is None:
            self.root_path = self.def_root_path
        else:
            self.root_path = root_path
        self.abs_root_path = os.path.abspath(self.root_path)
        # ---- check if root path is available and writable
        if not os.path.exists(self.abs_root_path):
            raise Error('Could not access root path %s for manager %s!' %
                        (quote(self.abs_root_path), quote(self.name)))
        if not self._check_path_writable(self.abs_root_path):
            raise Error('Could not write to root path %s of manager %s!' %
                        (quote(self.abs_root_path), quote(self.name)))
        # ---- setup locations management
        self.locations = {}
        self.sublocation_separator = SUBLOCATION_SEPARATOR
        # add ROOT location
        self.root_location_id = 'ROOT_%s' % self.name
        self.locations[self.root_location_id] = self.abs_root_path
        # ---- resolve DBM
        if create_dbm:
            # create default DBManager with file-based SQLite3 provider
            self.dbm = DBManager(self.root_path)
        else:
            self.dbm = None
Ejemplo n.º 5
0
 def _finalize_new_location(self, sublocs):
     locname = self.sublocation_separator.join(sublocs)
     locpathelem = os.path.sep.join(sublocs)
     locpath = os.path.join(self.abs_root_path, locpathelem)
     if self.getLocation(locname) is None:
         self.locations[locname] = locpath
         if not os.path.exists(locpath):
             try:
                 os.makedirs(locpath)
             except os.error, e:
                 raise Error(
                     'Could not create location %s under manager %s! (Reason: %s)'
                     % (quote(self.name), quote(locname), e))
Ejemplo n.º 6
0
 def _delete_location_from_sublocs(self, sublocs):
     locname = self.sublocation_separator.join(sublocs)
     locpath = self.getLocation(locname)
     try:
         del self.locations[locname]
     except KeyError:
         raise Error('Location %s not found under manager %s!' %
                     (quote(locname), quote(self.name)))
     try:
         shutil.rmtree(locpath)
     except (OSError, os.error) as e:
         raise Error(
             'Could not remove location %s under manager %s! (Reason: %s)' %
             (quote(locname), quote(self.name), e))
Ejemplo n.º 7
0
def GO_num2id(num):
    r"""
Resolve numerical part of GO term ID into full GO term ID.

Parameters
----------
num : integer/string
    supposed numerical part of GO term ID

Returns
-------
termID : string
    full GO term ID

Raises
------
Error
    if numerical part does not resolve to valid GO term ID
    """
    err = Error('Unrecognized numeric part of GO ID! (got %s)' %
                quote(str(num)))
    if isinstance(num, int):
        fmt = "%%0%dd" % _goid_num_length
        num = fmt % num
    elif isinstance(num, basestring):
        if _goid_num_patt.match(num) is None:
            raise err
    else:
        raise err
    return _goid_idprefix + num
Ejemplo n.º 8
0
    def removeJob(self, jobID):
        r"""
Remove requested job from this manager.

Parameters
----------
jobID : string
    job ID

Raises
------
Warn
    if the job is not yet executed and/or finished
Warn
    if the job has unrecognized status
        """
        jobStatus = self.getJobStatus(jobID)
        if jobStatus == JobStatus.FINISHED:
            del self.jobs[jobID]
        elif jobStatus == JobStatus.EXECUTING:
            raise Warn('The job is not yet finished!')
        elif jobStatus == JobStatus.ADDED:
            raise Warn('The job is not yet executed!')
        else:
            st = ','.join([quote(s) for s in JobStatus.statuses])
            raise Error('Unknown job status! (got %s) (shall be %s)' %
                        (jobStatus, st))
Ejemplo n.º 9
0
def generateHGNCSynonyms(hgnc_dsv, map_db_key):
    r"""
Create helper table that eases resolving of synonymic gene symbols with HGNC data.
The helper table may be created in different subordinated database than original
HGNC data. The table is specified via template.

Parameters
----------
hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains HGNC data

map_db_key : string
    ID of the database that will hold helper table

Returns
-------
synonymsDT : :class:`~kdvs.fw.DBTable.DBTable`
    DBTable wrapper for newly created helper table

See Also
--------
kdvs.fw.DBTable.DBTemplate
    """
    synonyms_dt = DBTable.fromTemplate(hgnc_dsv.dbm, map_db_key,
                                       HGNCSYNONYMS_TMPL)
    synonyms_dt.create(indexed_columns=HGNCSYNONYMS_TMPL['indexes'])
    syn_columns = (HGNC_APPROVED_SYMBOL_COL, HGNC_SYNONYMS_COL)
    syn_filter = "%s not like %s" % (quote(HGNC_SYNONYMS_COL),
                                     quote(HGNC_FIELD_EMPTY))
    hgnc_cs = hgnc_dsv.get(columns=syn_columns, filter_clause=syn_filter)

    def _gen():
        for syn_row in hgnc_cs:
            approved, rawsyns = [str(r) for r in syn_row]
            # parse synonyms
            if len(rawsyns) > 0:
                # parse synoyms
                syns = [s.strip() for s in rawsyns.split(HGNC_FIELDS_SEP)]
            else:
                # no synonyms, make approved its own synonym
                syns = [approved]
            for syn in syns:
                yield (syn, approved)

    synonyms_dt.load(_gen())
    hgnc_cs.close()
    return synonyms_dt
Ejemplo n.º 10
0
    def create(self, indexed_columns='*', debug=False):
        r"""
Physically create the table in underlying RDBMS; the creation is deferred until
this call. The table is created empty.

Parameters
----------
indexed_columns : list/tuple/'*'
    list/tuple of column names to be indexed by underlying RDBMS; if string '*'
    is specified, all columns will be indexed; '*' by default

debug : boolean
    provides debug mode for table creation; if True, collect all SQL statements
    produced by underlying RDBMS and return them as list of strings; if False,
    return None

Returns
-------
statements : list of strings/None
    RDBMS SQL statements issued during table creation, if debug mode is requested;
    or None otherwise

Raises
------
Error
    if table creation or indexing was interrupted with an error; essentially,
    reraise OperationalError from underlying RDBMS
        """
        statements = []
        # ---- create table
        cs = self.db.cursor()
        dberror = self.dbm.provider.getOperationalError()
        ctype = self.dbm.provider.getTextColumnType()
        # make columns
        cols = ','.join(['%s %s' % (quote(c), ctype) for c in self.columns])
        # make statement
        st = 'create table %s (%s)' % (quote(self.name), cols)
        if debug:
            statements.append(st)
        else:
            try:
                cs.execute(st)
            except dberror, e:
                raise Error('Cannot create table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
Ejemplo n.º 11
0
    def load(self, content=emptyGenerator(), debug=False):
        r"""
Fill the already created table with some data, coming from specified generator
callable.

Parameters
----------
content : generator callable
    generator callable that furnish the data; this method DOES NOT check the
    correctness of furnished data, this is left to the user; by default, empty
    generator callable is used

debug : boolean
    provides debug mode for table filling; if True, collect all SQL statements
    produced by underlying RDBMS and return them as list of strings; if False,
    return None

Returns
-------
statements : list of strings/None
    RDBMS SQL statements issued during table filling, if debug mode is requested;
    or None otherwise

Raises
------
Error
    if table filling was interrupted with an error; essentially, reraise
    OperationalError from underlying RDBMS
        """
        statements = []
        cs = self.db.cursor()
        dberror = self.dbm.provider.getOperationalError()
        # ---- load content
        for cont in content:
            if len(cont) > 0:
                ct = ','.join([quote(f) for f in cont])
                st = 'insert into %s values (%s)' % (quote(self.name), ct)
                if debug:
                    statements.append(st)
                else:
                    try:
                        cs.execute(st)
                    except dberror, e:
                        raise Error('Cannot insert content %s into table %s in database %s! (Reason: %s)' % (quote(ct), quote(self.name), quote(self.db_key), e))
Ejemplo n.º 12
0
    def isEmpty(self):
        r"""
Returns True if the table is empty, False otherwise.

Raises
------
Error
    if table has not yet been created
        """
        if not self.isCreated():
            raise Error('DataTable %s must be created in %s first!' % (quote(self.name), quote(self.db_key)))
        return self.countRows() == 0
Ejemplo n.º 13
0
 def _resolve_dialect(self, filehandle, sniff_line_count=10):
     peek_lines = list(itertools.islice(filehandle, sniff_line_count))
     cf = self.getCommentSkipper(peek_lines)
     cflines = ''.join([l for l in cf])
     buf = StringIO.StringIO(cflines).getvalue()
     try:
         self.dialect = csv.Sniffer().sniff(buf)
         self.delimiter = self.dialect.delimiter
         filehandle.seek(0)
     except csv.Error, e:
         raise Error(
             'Could not determine dialect for file %s! (Reason: %s)' %
             (quote(filehandle.name), e))
Ejemplo n.º 14
0
 def __str__(self):
     if self.result != NOTPRODUCED:
         res = "PRODUCED"
     else:
         res = str(NOTPRODUCED)
     try:
         cfunc = quote(self.call_func.__name__)
     except:
         cfunc = "<Function>"
     if len(self.call_args) <= DEFAULT_CALLARGS_LISTING_THR:
         cargs = 'args: %s' % str(self.call_args)
     else:
         cargs = "args: >%s" % DEFAULT_CALLARGS_LISTING_THR
     jobstr = "<Job (%s with %s) (Status: %s, Result: %s)>" % (
         cfunc, cargs, self.status, res)
     return jobstr
Ejemplo n.º 15
0
 def test_loadall4(self):
     dsv2_fh = DSV.getHandle(self.anno_dsv_path)
     dsv2 = DSV(self.dbm, self.testdb, dsv2_fh, dtname=self.test_dtname, comment=self.anno_comment)
     self.assertSequenceEqual(self.anno_header, dsv2.header)
     dsv2.create()
     dsv2.loadAll()
     dsv2.close()
     # low level checks
     cs = dsv2.db.cursor()
     cs.execute('select %s from %s' % (self.anno_header[0], dsv2.name))
     rres = cs.fetchall()
     res = [str(r[0]) for r in rres]
     self.assertSequenceEqual(self.anno_rows, res)
     cols = ','.join([quote(c) for c in self.anno_columns])
     cs.execute('select %s from %s' % (cols, dsv2.name))
     rres = cs.fetchall()
     res = {}
     for ix, ac in enumerate(self.anno_columns):
         res[ac] = [str(r[ix]) for r in rres]
     self.assertDictEqual(self.anno_columns_dict, res)
Ejemplo n.º 16
0
 def _extract_header(self, filehandle, makeMissingID):
     cf = self.getCommentSkipper(filehandle)
     l1 = cf.next()
     l2 = cf.next()
     l1_spl = [s.strip() for s in l1.split(self.delimiter)]
     l2_spl = [s.strip() for s in l2.split(self.delimiter)]
     if len(l1_spl) == len(l2_spl):
         if makeMissingID:
             if len(l1_spl[0]) == 0:
                 l1_spl.pop(0)
                 l1_spl.insert(0, DSV_DEFAULT_ID_COLUMN)
         self.header = l1_spl
         # rewind exactly after header
         # first rewind file to the beginning
         filehandle.seek(0)
         # then consume header
         cf.next()
     else:
         raise Error(
             'Could not determine header for file %s! (Wrong header line?)'
             % quote(filehandle.name))
Ejemplo n.º 17
0
    def build(self, anno_dsv, hgnc_dsv, map_db_key):
        r"""
Construct the mapping using resources already present in KDVS DB (via
:class:`~kdvs.core.db.DBManager`) and wrapped in :class:`~kdvs.fw.DSV.DSV`
instances. The mapping is built as database table and wrapped into
:class:`~kdvs.fw.DBTable.DBTable` instance; it is stored in public attribute :attr:`dbt`
of this instance. After the build is finished, the public attribute :attr:`built`
is set to True. This builder requires both Affymetrix annotations data and HGNC
data already loaded in KDVS DB and wrapped in DSV instances. Refer to the
comments for resolvancy protocol used.

Parameters
----------
anno_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains Affymetrix annotations data

hgnc_dsv : :class:`~kdvs.fw.DSV.DSV`
    valid instance of DSV that contains HGNC data

map_db_key : string
    ID of the database that will hold mapping table

Raises
------
Error
    if DSV containing Affymetrix annotation data is incorrectly specified, is
    not created, or is empty
Error
    if DSV containing HGNC data is incorrectly specified,
    is not created, or is empty
        """
        #
        # NOTE: this map follows resolvancy protocol implemented originally in
        # KDVS v 1.0.
        #
        # NOTE: in this map, we apply the following resolvancy protocol:
        # 1. get gene symbol(s) from annotations
        # 2. resolve them in HGNC data as follows:
        #    - if the symbol is approved and refers to gene, retain it
        #    - if the symbol is not approved, discard it
        # 3. for not discarded symbol(s) obtained in (2), get the following
        #    element(s) from HGNC data: Entrez Gene ID, Ensembl Gene ID, RefSeq IDs
        # 4. for not discarded symbol(s) obtained in (2), get the following
        #    element(s) from annotations: GB accession
        #
        # NOTE: in this map, we rely on annotations as the source of external
        # IDs; we still retain sequence public ID from annotations; gene symbols
        # are only verified if approved (i.e. the approval history is not followed)
        #
        # ---- check conditions for ANNO
        if not isinstance(anno_dsv, DSV):
            raise Error('%s instance expected! (got %s)' %
                        (DSV.__class__, anno_dsv.__class__))
        if not anno_dsv.isCreated():
            raise Error('Helper data table %s must be created first!' %
                        quote(anno_dsv.name))
        if anno_dsv.isEmpty():
            raise Error('Helper data table %s must not be empty!' %
                        quote(anno_dsv.name))
        # ---- check conditions for HGNC
        if not isinstance(hgnc_dsv, DSV):
            raise Error('%s instance expected! (got %s)' %
                        (DSV.__class__, hgnc_dsv.__class__))
        if not hgnc_dsv.isCreated():
            raise Error('Helper data table %s must be created first!' %
                        quote(hgnc_dsv.name))
        if hgnc_dsv.isEmpty():
            raise Error('Helper data table %s must not be empty!' %
                        quote(hgnc_dsv.name))
        # ---- create em2annotation
        em2annotation_dt = DBTable.fromTemplate(anno_dsv.dbm, map_db_key,
                                                EM2ANNOTATION_TMPL)
        em2annotation_dt.create(indexed_columns=EM2ANNOTATION_TMPL['indexes'])
        # pre-query data from HGNC: approved symbol, Entrez Gene ID,
        # Ensembl Gene ID, RefSeq IDs; filter non-gene entries
        # NOTE: we need cursor due to sheer quantity of data
        hgnc_data = dict()
        only_genes_filter = "%s in (%s)" % (quote(
            self._HGNC_LOCUS_TYPE_COL), ','.join(
                [quote(t) for t in self._HGNC_VALID_GENE_TYPES]))
        query_hgnc_columns = (self._HGNC_APPROVED_SYMBOL_COL,
                              self._HGNC_EGENE_ID_COL,
                              self._HGNC_ENSEMBL_ID_COL,
                              self._HGNC_REFSEQ_ID_COL)
        hgnc_cs = hgnc_dsv.get(columns=query_hgnc_columns,
                               filter_clause=only_genes_filter)
        for hgnc_row in hgnc_cs:
            approved, egeneid, ensembl, refseq = [str(r) for r in hgnc_row]
            hgnc_data[approved] = (egeneid, ensembl, refseq)
        hgnc_cs.close()
        # query ANNO for basic annotations: probeset ID, representative public ID,
        # gene symbol, GB accession
        # NOTE: we need cursor due to sheer quantity of data
        query_anno_columns = (anno_dsv.id_column,
                              self._ANNO_REPR_PUBLIC_ID_COL,
                              self._ANNO_SEQ_SOURCE_COL,
                              self._ANNO_GENE_SYMBOL_COL,
                              self._ANNO_GB_ACC_COL)
        anno_cs = anno_dsv.get(columns=query_anno_columns)

        def _gen():
            for arow in anno_cs:
                probeset_id, repr_pub_id, seqsrc, gss_str, gbacc = [
                    str(ar) for ar in arow
                ]
                # reconstruct correct public ID
                pubid = '%s:%s' % (self._ANNO_SEQSRC_ABBRS[seqsrc],
                                   repr_pub_id)
                # separate gene symbols
                gss = [
                    s.strip() for s in gss_str.split(self._ANNO_MULTIFIELD_SEP)
                ]
                gs_rec = list()
                egeneid_rec = list()
                ensembl_rec = list()
                refseq_rec = list()
                for gs in gss:
                    if gs in hgnc_data:
                        # gene symbol is approved
                        gs_rec.append(gs)
                        egeneid, ensembl, refseq = hgnc_data[gs]
                        egeneid_rec.append(egeneid)
                        ensembl_rec.append(ensembl)
                        refseq_rec.append(refseq)
                gs_s = MULTIFIELD_SEP.join(gs_rec)
                egeneid_s = MULTIFIELD_SEP.join(
                    egeneid_rec) if len(egeneid_rec) > 0 else ''
                ensembl_s = MULTIFIELD_SEP.join(
                    ensembl_rec) if len(ensembl_rec) > 0 else ''
                refseq_s = MULTIFIELD_SEP.join(
                    refseq_rec) if len(refseq_rec) > 0 else ''
                yield (probeset_id, gs_s, pubid, gbacc, egeneid_s, ensembl_s,
                       refseq_s)

        em2annotation_dt.load(_gen())
        # ---- query em2annotation and build map
        # NOTE: we need cursor due to sheer quantity of data
        query_em2a_columns = (em2annotation_dt.id_column, 'gene_symbol')
        em2a_cs = em2annotation_dt.get(columns=query_em2a_columns)
        for em2a_row in em2a_cs:
            pr_id, gs = [str(r) for r in em2a_row]
            self.gene2emid[gs] = pr_id
        em2a_cs.close()
        self.built = True
        self.dbt = em2annotation_dt
Ejemplo n.º 18
0
class DBTable(object):
    r"""
Low--level wrapper over database table managed by KDVS DB manager. KDVS uses
database tables to manage query--intensive information, such as the robust
generation of data subsets from single main input data set. The wrapper encapsulates
basic functionality incl. table creation, table filling from specific generator
function, querying with conditions over colums and rows (in case where first
column holds row IDs), generation of associated :class:`numpy.ndarray` object
(if possible), as well as basic counting routines.
    """
    def __init__(self, dbm, db_key, columns, name=None, id_col=None):
        r"""
Parameters
----------
dbm : :class:`~kdvs.core.db.DBManager`
    an instance of DB manager that is managing this table

db_key : string
    internal ID of the table used by DB manager instance; it is NOT the name of
    physical database table in underlying RDBMS; typically, user of DB manager
    refers to the table by this ID and not by its physical name

columns : list/tuple of strings
    column names for the table

name : string/None
    physical name of database table in underlying RDBMS; if None, the name is
    generated semi--randomly; NOTE: ordinary user of DB manager shall refer to
    the table with 'db_key' ID

id_col : string/None
    designates specific column to be "ID column"; if None, the first column is
    designated as ID column

Raises
------
Error
    if DBManager instance is not present
Error
    if list/tuple with column names is not present
Error
    if ID column name is not the one of existing columns
        """
        # ---- resolve DBManager
        if not isinstance(dbm, DBManager):
            raise Error('%s instance expected! (got %s)' % (DBManager.__class__, dbm.__class__))
        else:
            self.dbm = dbm
        # ---- get target DB
        db = dbm.getDB(db_key)
        self.db_key = db_key
        self.db = db
        # ---- resolve columns
        if isListOrTuple(columns):
            self.columns = tuple(columns)
        else:
            raise Error('List or tuple expected! (got %s)' % columns.__class__)
        # ---- resolve ID column
        if id_col is None:
            self.id_column_idx = 0
            self.id_column = self.columns[0]
        else:
            if id_col in self.columns:
                self.id_column_idx = self.columns.index(id_col)
                self.id_column = id_col
            else:
                raise Error('ID column must be one of the existing columns! (got %s)' % id_col)
        # ---- resolve table name
        if name is None:
            self.name = '%s%s' % (self.__class__.__name__, uuid.uuid4().hex)
        else:
            self.name = name

    def create(self, indexed_columns='*', debug=False):
        r"""
Physically create the table in underlying RDBMS; the creation is deferred until
this call. The table is created empty.

Parameters
----------
indexed_columns : list/tuple/'*'
    list/tuple of column names to be indexed by underlying RDBMS; if string '*'
    is specified, all columns will be indexed; '*' by default

debug : boolean
    provides debug mode for table creation; if True, collect all SQL statements
    produced by underlying RDBMS and return them as list of strings; if False,
    return None

Returns
-------
statements : list of strings/None
    RDBMS SQL statements issued during table creation, if debug mode is requested;
    or None otherwise

Raises
------
Error
    if table creation or indexing was interrupted with an error; essentially,
    reraise OperationalError from underlying RDBMS
        """
        statements = []
        # ---- create table
        cs = self.db.cursor()
        dberror = self.dbm.provider.getOperationalError()
        ctype = self.dbm.provider.getTextColumnType()
        # make columns
        cols = ','.join(['%s %s' % (quote(c), ctype) for c in self.columns])
        # make statement
        st = 'create table %s (%s)' % (quote(self.name), cols)
        if debug:
            statements.append(st)
        else:
            try:
                cs.execute(st)
            except dberror, e:
                raise Error('Cannot create table %s in database %s! (Reason: %s)' % (quote(self.name), quote(self.db_key), e))
        # ---- create indexes
        # resolve indexed columns
        if indexed_columns == '*':
            indexed = tuple(self.columns)
        else:
            if isListOrTuple(indexed_columns):
                indexed = tuple(indexed_columns)
            else:
                raise Error('List or tuple expected! (got %s)' % indexed_columns.__class__)
        # make indexes
        for ic in indexed:
            idx_name = '%s__%s' % (self.name, ic)
            idx_st = 'create index %s on %s(%s)' % (quote(idx_name), quote(self.name), quote(ic))
            if debug:
                statements.append(idx_st)
            else:
                try:
                    cs.execute(idx_st)
                except dberror, e:
                    raise Error('Cannot create index on column %s for table %s in database %s! (Reason: %s)' % (quote(ic), quote(self.name), quote(self.db_key), e))
Ejemplo n.º 19
0
    def get(self, columns='*', rows='*', filter_clause=None, debug=False):
        r"""
Perform query from the table under specified conditions and return corresponding
Cursor instance; the Cursor may be used immediately in straightforward manner or
may be wrapped in :class:`~kdvs.fw.DBResult.DBResult` instance.

Parameters
----------
columns : list/tuple/'*'
    list of column names that the quering will be performed from; if string '*'
    is specified instead, all columns will be queried; '*' by default

rows: list/tuple/'*'
    list of rows (i.e. list of values from designated ID column) that the
    quering will be performed for; if string '*' is specified instead, all rows
    (i.e. whole content of ID column) will be queried; '*' by default

filter_clause : string/None
    additional filtering conditions stated in the form of correct SQL WHERE
    clause suitable for underlying RDBMS; if None, no additional filtering is
    added; None by default

debug : boolean
    provides debug mode for table querying; if True, collect all SQL statements
    produced by underlying RDBMS and return them as list of strings; if False,
    return None; False by default; NOTE: for this method, debug mode DOES NOT
    perform any physical querying, it just produces underlyng SQL statements
    and returns them

Returns
-------
cs/statements : Cursor/list of strings
    if debug mode was not requested: proper Cursor instance that may be used
    immediately or wrapped into DBResult object; if debug mode was requested:
    RDBMS SQL statements issued during table querying

Raises
------
Error
    if list/tuple of columns/rows was specified incorrectly
    if specified list of columns/rows is empty
    if table querying was interrupted with an error; essentially, reraise
    OperationalError from underlying RDBMS

See Also
--------
:pep:`249`
        """
        statements = []
        cs = self.db.cursor()
        dberror = self.dbm.provider.getOperationalError()
        # ---- resolve columns
        if columns == '*':
            cols_st = columns
        else:
            if isListOrTuple(columns):
                if len(columns) == 0:
                    raise Error('Non-empty list of columns expected!')
            else:
                raise Error('List or tuple expected! (got %s)' % columns.__class__)
            cols_st = ','.join([quote(c) for c in columns])
        # ---- resolve rows
        if rows != '*':
            if isListOrTuple(rows):
                if len(rows) > 0:
                    rs = tuple(rows)
                else:
                    raise Error('Non-empty list of rows expected!')
            else:
                raise Error('List or tuple expected! (got %s)' % rows.__class__)
            rows_st = ','.join([quote(r) for r in rs])
        else:
            rows_st = rows
        # ---- make statement
        if rows_st == '*':
            # resolve filter clause
            if filter_clause is not None:
                flt_cl = ' where %s' % filter_clause
            else:
                flt_cl = ''
            get_st = 'select %s from %s%s' % (cols_st, quote(self.name), flt_cl)
        else:
            # resolve filter clause
            if filter_clause is not None:
                flt_cl = ' and %s' % filter_clause
            else:
                flt_cl = ''
            get_st = 'select %s from %s where %s in (%s)%s' % (cols_st, quote(self.name), quote(self.id_column), rows_st, flt_cl)
        # ---- get content
        if debug:
            statements.append(get_st)
        else:
            try:
                cs.execute(get_st)
            except dberror, e:
                raise Error('Cannot select from table %s in database %s! (Reason: %s) (Cols: %s) (Rows: %s)' % (
                                quote(self.name), quote(self.db_key), e, columns, rows))
Ejemplo n.º 20
0
    def __init__(self,
                 dbm,
                 db_key,
                 filehandle,
                 dtname=None,
                 delimiter=None,
                 comment=None,
                 header=None,
                 make_missing_ID_column=True):
        r"""
Parameters
----------
dbm : :class:`~kdvs.core.db.DBManager`
    an instance of DB manager that is managing this table

db_key : string
    internal ID of the table used by DB manager instance; it is NOT the name of
    physical database table in underlying RDBMS; typically, user of DB manager
    refers to the table by this ID and not by its physical name

filehandle : file--like
    file handle to associated DSV file that contains the data that DSV table
    will hold; the file remains open but the data loading is deferred until
    requested

dtname : string/None
    physical name of database table in underlying RDBMS; if None, the name is
    generated semi--randomly; NOTE: ordinary user of DB manager shall refer to
    the table with 'db_key' ID

delimiter : string/None
    delimiter string of length 1 that should be used for parsing of DSV data;
    if None, the constructor tries to deduce delimiter by looking into first 10
    lines of associated DSV file; None by default; NOTE: giving explicit delimiter
    instead of deducing it dynamically greatly reduces possibility of errors
    during parsing DSV data

comment : string/None
    comment prefix used in associated DSV file, or None if comments are not used;
    None by default

header : list/tuple of string / None
    if header is present in the form of list/tuple of strings, it will be used
    as list of columns for the underlying database table; if None, the constructor
    tries to deduce the correct header by looking into first two lines of
    associated DSV file; None by default; NOTE: for well formed DSV files, header
    should be present, so it is relatively safe to deduce it automatically

make_missing_ID_column : boolean
    used in connection with previous argument; sometimes one can encounter DSV
    files that contain NO first column name in the header (e.g. generated from
    various R functions), and while they contain correct data, such files are
    syntactically incorrect; if the constructor sees lack of the first column name,
    it can proceed according to this parameter; if True, it inserts the content
    of :data:`DSV_DEFAULT_ID_COLUMN` variable as the missing column name; if False, it
    inserts empty string "" as the missing column name; True by default

Raises
------
Error
    if proper comment string was not specified
Error
    if underlying DSV dialect of associated DSV file has not been resolved correctly
Error
    if delimiter has not been specified correctly
Error
    if header iterable has not been specified correctly
Error
    if parsing of DSV data during deducing was interrupted with an error; essentially,
    it reraises underlying csv.Error

See Also
--------
csv
        """
        # ---- resolve comment
        if comment is not None and not isinstance(comment, basestring):
            raise Error('String or None expected! (got %s)' % comment)
        else:
            self.comment = comment
        # ---- resolve delimiter and dialect
        if delimiter is None:
            self._resolve_dialect(filehandle)
        else:
            if isinstance(delimiter, basestring) and len(delimiter) == 1:
                self.delimiter = delimiter
                try:
                    self.dialect = _dialects[delimiter]
                except KeyError:
                    raise Error(
                        'Dialect not identified for delimiter %s! (Sniffing required?)'
                        % quote(delimiter))
            else:
                raise Error('Single character expected! (got %s)' %
                            (delimiter))
        # ---- resolve header
        if header is None:
            self._extract_header(filehandle, make_missing_ID_column)
        else:
            if isListOrTuple(header):
                if len(header) == 0:
                    self._autogenerate_header(filehandle,
                                              make_missing_ID_column)
                else:
                    self._verify_header(filehandle, header)
                    self.header = header
            else:
                raise Error('List or tuple expected! (got %s)' %
                            header.__class__)
        # ---- DSV analysis finished, initialize underlying instance
        self.handle = filehandle
        super(DSV, self).__init__(dbm, db_key, self.header, dtname)
Ejemplo n.º 21
0
 def _parse(self, xml_fh):
     # used pattern for parsing large files, as documented in:
     # http://effbot.org/zone/element-iterparse.htm
     #
     # ---- get context iterator
     context = iter(iterparse(xml_fh, events=('start', 'end')))
     # ---- get root element
     event, root = context.next()
     # ---- check root tag
     root_tag = root.tag
     if root_tag != self.root_tag:
         raise Error(
             'Could not recognize XML dialect of the release! (got root tag: %s, should be: %s)'
             % (quote(root_tag), quote(self.root_tag)))
     # ---- process other elements
     for event, item in context:
         # ---- process header
         if event == 'end' and item.tag.endswith('header'):
             # ---- get format ID
             self._format = self._findSingleChildText(
                 item, 'format-version')
             # ---- get release date
             rel_date = self._findSingleChildText(item, 'date')
             self._release_date = datetime.strptime(rel_date,
                                                    "%d:%m:%Y %H:%M")
             item.clear()
             root.clear()
         # ---- process individual terms
         if event == 'end' and item.tag.endswith('term'):
             # ---- get term acc
             term = self._findSingleChildText(item, 'id')
             self.terms[term] = dict()
             # ---- get term name
             name = self._findSingleChildText(item, 'name')
             self.terms[term]['name'] = name
             # ---- get term namespace
             namespace = self._findSingleChildText(item, 'namespace')
             self.domain2concepts[namespace] = term
             # ---- get term description
             desc = self._findSingleChildText(item, 'def/defstr')
             self.terms[term]['desc'] = desc
             # ---- get obsolence
             obs_el = item.find('is_obsolete')
             if obs_el is not None and obs_el.text == '1':
                 self.terms[term]['obsolete'] = True
                 is_obsolete = 1
                 obs_el.clear()
             else:
                 self.terms[term]['obsolete'] = False
                 is_obsolete = 0
             # ---- process further if term not obsolete
             if is_obsolete == 0:
                 # ---- get alt ids
                 alt_id_els = item.findall('alt_id')
                 for alt_id_el in alt_id_els:
                     alt_id = alt_id_el.text
                     self.synonyms[term] = alt_id
                     alt_id_el.clear()
                 # ---- get relationships
                 # is_a
                 is_a_els = item.findall('is_a')
                 for is_a_el in is_a_els:
                     parent = is_a_el.text
                     if 'is_a' in self.recognizedRelations:
                         self.termsRelationsHierarchy['is_a'][parent] = term
                         self.termsPlainHierarchy[parent] = term
                     is_a_el.clear()
                 # other relationships
                 rel_els = item.findall('relationship')
                 for rel_el in rel_els:
                     rel_type = self._findSingleChildText(rel_el, 'type')
                     rel_dst = self._findSingleChildText(rel_el, 'to')
                     if rel_type in self.recognizedRelations:
                         self.termsRelationsHierarchy[rel_type][
                             term] = rel_dst
                         self.termsPlainHierarchy[term] = rel_dst
                     rel_el.clear()
             item.clear()
             root.clear()