Ejemplo n.º 1
0
Archivo: db.py Proyecto: vamin/MESS.DB
 def __init__(self, **kwargs):
     """Initialize attributes and db connection.
     
     Args:
         All kwargs will be passed to sqlite3.connect()
     """
     self.tries = 0
     self.max_tries = 3
     self.check_version()
     self.log_console = Log('console')
     self.log_all = Log('all')
     self.open(**kwargs)
Ejemplo n.º 2
0
 def __init__(self):
     """Initialize db cursor.
     
     Args:
         db (obj): A MessDB object
     """
     self.db = MessDB()
     self.log = Log('all')
     self.source_dir = None
     self.id = None
     self.name = None
     self.dirname = None
     self.url = None
     self.url_template = None
     self.last_update = None
Ejemplo n.º 3
0
def main():
    """Parse args, load the user-specified tool and execute it."""
    check_dependencies()
    toolmanager = ToolManager()
    parser = argparse.ArgumentParser(
        description="A collection of tools for interacting with MESS.DB", formatter_class=CustomArgparseFormatter
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help=("increase output verbosity to include " "debugging messages")
    )
    toolmanager.populate_parser(parser)
    args = parser.parse_args()
    log = Log("console")
    log.setup(MESS_DIR, args.verbose)
    log.debug(get_mem_usage())
    tool = toolmanager.load_tool(args.subparser_name)
    tool.execute(args)
    log.debug(get_mem_usage())
Ejemplo n.º 4
0
 def __init__(self):
     """Set up db, check for attributes, dependencies, and setup."""
     self.db = MessDB()
     self.path = MethodPath()
     self.log_console = Log('console')
     self.log_all = Log('all')
     self.method_name = self.get_method_name()
     try:
         self.parameters
         self.description
         self.geop  # flag indicates method results in new xyz coordinates
         self.prog_name
         self.prog_version
         self.prog_url
     except AttributeError as err:
         print(''.join([str(err), '\n']), file=sys.stderr)
         sys.exit(('Each method class needs to define description, geop, '
                   'prog_name, prog_version, prog_url, '
                   'parameters as attributes.'))
     self.check_dependencies()
Ejemplo n.º 5
0
class Source(object):
    """This class provides methods for importing sources and maintaining source
    information in mess.db and in the molecule directories.
    
    Attributes:
        db (obj): A MessDB object
        log (obj): A Log('all') object
        source_dir (str): A path to the source directory
        id (int): The source_id in the mess.db source table
        name (str): A name for the source
        dirname (str): The name of the source subdirectory in the 'sources'
                       directory
        url (str): Url where the source can be downloaded
        url_template (str): A url template that can be used to go to the web
                            page for a particular molecule in a source catalog
        last_update (str): Date when source was last downloaded
    """
    def __init__(self):
        """Initialize db cursor.
        
        Args:
            db (obj): A MessDB object
        """
        self.db = MessDB()
        self.log = Log('all')
        self.source_dir = None
        self.id = None
        self.name = None
        self.dirname = None
        self.url = None
        self.url_template = None
        self.last_update = None
    
    @classmethod
    def get_sources(cls):
        """Returns a dictionary of source basenames mapped to source paths."""
        source_dir = os.path.join(os.path.dirname(__file__), '../sources/')
        sources = {}
        for root, _, filenames in os.walk(source_dir):
            for filename in [f for f in filenames
                             if f.endswith('.ini')
                             and not (f.startswith('.') or f.startswith('~'))]:
                ini_basename = os.path.splitext(filename)[0]
                if ini_basename == os.path.basename(root):
                    sources[ini_basename] = os.path.abspath(root)
        return sources
    
    def files(self):
        """Returns a list of importable files in the source directory."""
        return [f for f in os.listdir(self.source_dir)
                if not (f.startswith('.')
                        or f.startswith('~')
                        or f.endswith('.txt'))
                and f.split('.')[-1] in pybel.informats]
    
    def setup(self, source):
        """Setup source in mess.db.
        
        Args:
            source: A path to a source directory or a source basename.
        """
        source_basename = os.path.basename(source.rstrip(os.sep))
        if source_basename.endswith('.ini'):
            source_basename = os.path.splitext(source_basename)[0]
        sources = self.get_sources()
        if source_basename not in sources:
            sys.exit(("All sources must reside in the 'sources' directory, "
                      'read sources/SOURCES_README.md for details.'))
        self.source_dir = sources[source_basename]
        source_ini = os.path.join(self.source_dir, '%s.ini' % source_basename)
        source_attributes = self.parse_ini(source_ini)
        # insert/update source in the database
        total_changes = self.db.total_changes
        insert_query = ('INSERT OR IGNORE INTO source '
                        '(name, dirname, url, url_template, '
                        'citation, last_update) '
                        'VALUES (?, ?, null, null, null, null)')
        update_query = ('UPDATE source '
                        'SET url=?, url_template=?, citation=?, last_update=? '
                        'WHERE dirname=?;')
        self.db.execute(insert_query, (source_attributes['name'],
                                       source_basename))
        self.db.execute(update_query, (source_attributes['url'],
                                       source_attributes['url_template'],
                                       source_attributes['citation'],
                                       source_attributes['last_update'],
                                       source_basename))
        if self.db.total_changes - total_changes > 1:
            self.log.info('%s added to sources in database', source_basename)
        select_query = ('SELECT source_id, name, dirname, '
                        'url, url_template, last_update '
                        'FROM source WHERE dirname=?')
        source_row = self.db.execute(select_query,
                                     (source_basename,)).fetchone()
        # set attributes
        self.id = source_row.source_id
        self.name = source_row.name
        self.dirname = source_row.dirname
        self.url = source_row.url
        self.url_template = source_row.url_template
        self.last_update = source_row.last_update
    
    def parse_ini(self, ini):
        """Parse source ini and return attributes."""
        source_attributes = {
            'url': None,
            'url_template': None,
            'citation': None
        }
        config = cp.ConfigParser(dict_type=CaseInsensitiveDict)
        config.read(ini)
        for section in config.sections():
            for option in config.options(section):
                source_attributes[option] = unicode_replace(config.get(section,
                                                                       option))
        required_attributes = ('name', 'last_update')
        if not all(att in source_attributes for att in required_attributes):
            sys.exit('Source INI missing required attributes: %s.'
                     % ' and/or '.join(required_attributes))
        return source_attributes
    
    def update_molecule_source_query(self, inchikey, identifier):
        """Update the source in mess.db.
        
        Args:
            inchikey: A molecule InChIKey.
            identifier: A source identifier (usually a catalog number).
        """
        query = ('INSERT OR IGNORE INTO molecule_source '
                 '(inchikey, source_id, identifier) '
                 'VALUES (?, ?, ?)')
        return (query, (inchikey, self.id, identifier))
    
    def update_source_tsv(self, inchikey, identifier):
        """Update the sources.tsv file.
        
        Args:
            inchikey_dir: Dir to a molecule in the molecules dir.
            identifier: A source identifier (usually a catalog number).
        
        """
        inchikey_dir = get_inchikey_dir(inchikey)
        name = self.name.encode('ascii', 'replace')
        dirname = self.dirname.encode('ascii', 'replace')
        identifier = identifier.encode('ascii', 'replace')
        sources_tsv = os.path.join(inchikey_dir, '%s.sources.tsv' % inchikey)
        with codecs.open(sources_tsv, 'r', 'ascii') as sources_in:
            with codecs.open(sources_tsv, 'a', 'ascii') as sources_out:
                sources_in = csv.reader(sources_in, delimiter=b'\t')
                sources_out = csv.writer(sources_out, delimiter=b'\t')
                # check if source has been recorded
                source_present = False
                for row in sources_in:
                    try:
                        if row[1] == dirname and row[2] == identifier:
                            source_present = True
                    except IndexError:
                        pass
                if not source_present:
                    if self.url_template:
                        url_split = re.split(r"\[|\]", self.url_template)
                        (match, replace) = re.split(r",\s?", url_split[1])
                        url_identifier = re.sub(match, replace, identifier)
                        source_url = url_split[0] + url_identifier
                        if 2 < len(url_split):
                            source_url += url_split[2]
                    else:
                        source_url = ''
                    sources_out.writerow([name, dirname, identifier,
                                          source_url.encode('ascii',
                                                            'replace')])
                    self.log.inchikey = inchikey
                    self.log.info('%s added to %s sources', name, inchikey)
                    self.log.inchikey = None
Ejemplo n.º 6
0
Archivo: db.py Proyecto: vamin/MESS.DB
class MessDB(object):
    """Manage a connection to mess.db.
    
    Attributes:
        tries (int): number of times a db commit has failed
        max_tries (int): maximum number of times to try db commit
        conn (obj): sqlite3 db connection object
        total_changes (int): number of rows modified/added/removed since open
    """
    def __init__(self, **kwargs):
        """Initialize attributes and db connection.
        
        Args:
            All kwargs will be passed to sqlite3.connect()
        """
        self.tries = 0
        self.max_tries = 3
        self.check_version()
        self.log_console = Log('console')
        self.log_all = Log('all')
        self.open(**kwargs)
    
    def __del__(self):
        """On deletion, commit transations and close the connection."""
        try:
            self.close()
        except sqlite3.ProgrammingError:
            pass  # there was no db connection
    
    @property
    def total_changes(self):
        """Returns the total number of database rows that have been modified,
        inserted, or deleted since the db connection was opened."""
        return self.conn.total_changes
    
    @classmethod
    def namedtuple_factory(cls, cursor, row):
        """
        Usage:
            conn.row_factory = namedtuple_factory
        """
        fields = [col[0] for col in cursor.description]
        row_tuple = namedtuple('Row', fields)
        return row_tuple(*row)
    
    @classmethod
    def check_version(cls):
        """Ensure that all dependencies are satisfied."""
        if LooseVersion(sqlite3.sqlite_version) < LooseVersion('3.7.0'):
            sys.exit(('The database requres SQLite version 3.7 or greater '
                      'due to the use of WAL mode.'))
        return sqlite3.sqlite_version
    
    def check_tables(self):
        """Check that mess.db has the right number of tables in it."""
        query = 'SELECT count(*) count FROM sqlite_master WHERE type=?'
        result = self.cursor().execute(query, ('table',)).next()
        if result.count != 14:
            return False
        return True
    
    def initialize(self):
        """Load the mess.db schema."""
        tables = os.path.join(os.path.dirname(__file__),
                              '../db/schema/tables.sql')
        views = os.path.join(os.path.dirname(__file__),
                             '../db/schema/views.sql')
        triggers = os.path.join(os.path.dirname(__file__),
                                '../db/schema/triggers.sql')
        self.executescript(codecs.open(tables, encoding='utf-8').read())
        self.executescript(codecs.open(views, encoding='utf-8').read())
        self.executescript(codecs.open(triggers, encoding='utf-8').read())
        result = self.execute('PRAGMA journal_mode=wal').next()
        if not result.journal_mode == 'wal':
            self.log_console.warning(('Setting journal mode to WAL failed. '
                                      'Run PRAGMA  journal_mode=wal in SQLite '
                                      'before continuing.'))
        self.log_all.info('new mess.db initialized')
    
    def open(self, **kwargs):
        """Open a connection to db/mess.db and set up row factory."""
        try:
            if 'database' not in kwargs:
                kwargs['database'] = os.path.join(os.path.dirname(__file__),
                                                  '../db/mess.db')
            if 'timeout' not in kwargs:
                kwargs['timeout'] = 120
            self.conn = sqlite3.connect(**kwargs)
        except IOError:
            sys.exit('could not find/create mess.db')
        self.conn.row_factory = self.namedtuple_factory
        if not self.check_tables():
            self.initialize()
    
    def reopen(self):
        """Commit, close, and open db connection."""
        self.close()
        self.open()
    
    def close(self):
        """Close the db connection."""
        self.commit()
        self.conn.close()
    
    def cursor(self):
        """Return cursor."""
        return self.conn.cursor()
    
    def execute(self, query, values=()):
        """Execute a single query."""
        try:
            with self.conn:
                return self.conn.execute(query, values)
        except sqlite3.Error as err:
            self.log_console.error('%s failed with values %s\n%s',
                                   query, values, err)
            
    def executemany(self, query, values):
        """Execute a single query with many values."""
        try:
            with self.conn:
                return self.conn.executemany(query, values)
        except sqlite3.Error as err:
            self.log_console.error('%s failed with many values\n%s',
                                   query, err)
    
    def executescript(self, script):
        """Read and execute queries from file."""
        try:
            with self.conn:
                return self.conn.executescript(script)
        except sqlite3.Error as err:
            self.log_console.error('script %s failed\n%s', script, err)
    
    def commit(self):
        """Commit transactions to db."""
        while self.tries < self.max_tries:
            try:
                return self.conn.commit()
            except sqlite3.OperationalError:
                self.log_console.info('database is locked, trying again')
                self.tries += 1
        sys.exit('Database is still locked after %d tries.' % self.tries)
Ejemplo n.º 7
0
class AbstractMethod(object):
    """All methods must inherit from this class.
    
    Attributes:
        db (obj): A MessDB object
        method_name (str): The name of the method
        description (str): Description of method
        geop (bool): Whether the method generates a new geometry
        prog_name (str): Program name
        prog_version (str): Program version
        prog_url (str): Program url
        parameters (dict): Parameters that affect program execution
    """
    parameters = dict()
    shortdesc = None
    method_citation = None
    prog_citation = None
    _inchikey = None
    _path_id = None
    _parent_path_id = None
    _method_dir = None
    _parent_method_dir = None
    
    def __init__(self):
        """Set up db, check for attributes, dependencies, and setup."""
        self.db = MessDB()
        self.path = MethodPath()
        self.log_console = Log('console')
        self.log_all = Log('all')
        self.method_name = self.get_method_name()
        try:
            self.parameters
            self.description
            self.geop  # flag indicates method results in new xyz coordinates
            self.prog_name
            self.prog_version
            self.prog_url
        except AttributeError as err:
            print(''.join([str(err), '\n']), file=sys.stderr)
            sys.exit(('Each method class needs to define description, geop, '
                      'prog_name, prog_version, prog_url, '
                      'parameters as attributes.'))
        self.check_dependencies()
    
    def __hash__(self):
        """Hash based on method name and parameters.
        
        Returns:
            A hex string of the sha1 hash of self.method_name plus
            JSON-serialized self.parameters. Keys are sorted.
        """
        return hashlib.sha1(self.method_name +
                            json.dumps(dict((str(k).lower(),
                                             str(v).lower())
                                            for k, v
                                            in self.parameters.iteritems()),
                                       sort_keys=True)).hexdigest()
    
    @property
    def hash(self):
        """Get hash."""
        return self.__hash__()
    
    @property
    def method_id(self):
        """Get the object's method_id attribute."""
        query = ('SELECT method_id FROM method '
                 'WHERE hash = ?;')
        row = self.db.execute(query, (self.hash,)).fetchone()
        return row.method_id
    
    @property
    def path_id(self):
        """Get the path id of the method."""
        if not self.path.get_method_id() == self.method_id:
            self._setup_path()
        return self._path_id
    
    @property
    def method_dir(self):
        """Get the directory name of the method."""
        if not self.path.get_method_id() == self.method_id:
            self._setup_path()
        return self._method_dir
    
    @property
    def parent_method_dir(self):
        """Get the parent directory name of the method."""
        if not self.path.get_method_id() == self.method_id:
            self._setup_path()
        return self._parent_method_dir
    
    @property
    def inchikey(self):
        """Get inchikey."""
        return self._inchikey
    
    @inchikey.setter
    def inchikey(self, inchikey):
        """Set inchikey, and update inchikey of logger."""
        if inchikey is not None and not is_inchikey(inchikey):
            raise RuntimeError('invalid inchikey: %s' % inchikey)
        self._inchikey = inchikey
        self.log_all.inchikey = inchikey
    
    @classmethod
    def get_method_name(cls):
        """Return the name of the method, derived from the subclass name."""
        return cls.__name__.replace('_', '').lower()
    
    def _setup_path(self):
        """Setup path given current method id and parent path."""
        self.path.setup_path(self.method_id, self._parent_path_id)
        self._path_id = self.path.get_path_id()
        self._method_dir = self.path.get_path_directory()
        self._parent_method_dir = self.path.get_parent_path_directory()
    
    def _insert_method(self):
        """Set insert program to db, set up hash, and insert method to db."""
        total_changes = self.db.total_changes
        query = ('INSERT OR IGNORE INTO method '
                 '(program_id, geop, name, shortdesc, citation, hash) '
                 'SELECT program.program_id, ?, ?, ?, ?, ? '
                 'FROM program '
                 'WHERE program.name=? AND program.version=?')
        self.db.execute(query, (self.geop, self.method_name, self.shortdesc,
                                self.method_citation, self.hash,
                                self.prog_name, self.prog_version))
        if self.db.total_changes - total_changes > 0:
            self.log_all.info('new %s method added to MESS.DB',
                              self.method_name)
    
    def _insert_program(self):
        """Adds row to program table in mess.db."""
        total_changes = self.db.total_changes
        query = ('INSERT OR IGNORE INTO program '
                 '(name, version, url, citation) '
                 'VALUES (?, ?, ?, ?)')
        self.db.execute(query,
                        (self.prog_name, self.prog_version, self.prog_url,
                         self.prog_citation))
        if self.db.total_changes - total_changes > 0:
            self.log_all.info('program %s %s added to MESS.DB',
                              self.prog_name, self.prog_version)
    
    def _insert_parameters(self):
        """Import paramaters dict to mess.db.
        
        Args:
            name: Name of parameter.
            setting: The value the parameter is set to.
        """
        added_parameters = 0
        for name, setting in self.parameters.items():
            query = ('INSERT OR IGNORE INTO parameter (name) VALUES (?)')
            self.db.execute(query, (name, ))
            total_changes = self.db.total_changes
            query = ('INSERT OR IGNORE INTO method_parameter '
                     '(method_id, parameter_id, setting) '
                     'SELECT ?, parameter.parameter_id, ? '
                     'FROM program, parameter '
                     'WHERE parameter.name=?')
            self.db.execute(query, (self.method_id, setting, name))
            added_parameters += (self.db.total_changes - total_changes)
        if added_parameters > 0:
            self.log_all.info('%i method parameters added to MESS.DB',
                              added_parameters)
    
    def get_insert_property_query(self, inchikey, name, description,
                                  format_, value, units=''):
        """Returns query to insert property value to mess.db.
        
        Args:
            inchikey: The inchikey of a molecule in MESS.DB.
            method_path_id: Path id for the calculations that generated the
                            property.
            name: The property name.
            description: A description of the property.
            format_: A description of the format the property is in.
            value: The calculated property.
            units: Units for the property value.
        """
        query = ('INSERT OR IGNORE INTO molecule_method_property_denorm '
                 'VALUES (?, ?, ?, ?, ?, ?, ?);')
        return (query, (inchikey, self.path_id, name, description,
                        format_, units, value))
    
    def get_insert_moldata_queries(self, inchikey, mol,
                                   description='', units=''):
        """Returns queries to insert molecule data values to mess.db."""
        for name, value in mol.data.iteritems():
            yield self.get_insert_property_query(inchikey,
                                                 name,
                                                 description,
                                                 type(value).__name__,
                                                 value,
                                                 units)
    
    def get_timing_query(self, inchikey, start):
        """Get a query to insert execution time property into db."""
        return self.get_insert_property_query(inchikey, 'runtime',
                                              'execution time',
                                              type(start).__name__,
                                              time.time() - start, 's')
    
    def set_parent_path(self, parent_path):
        """Set the parent path (e.g., path to method containing input
        geometry.)"""
        if parent_path > 0:
            self._parent_path_id = parent_path
    
    def has_parent_path(self, inchikey):
        """Returns True if molecule has had entire parent path calculated,
        False otherwise."""
        query = ('SELECT inchikey FROM molecule_method_property WHERE '
                 'inchikey = ? AND method_path_id = ?')
        try:
            self.db.execute(query,
                            (inchikey, self._parent_path_id)).fetchone()[0]
            return True
        except TypeError:
            return False
    
    def check_dependencies(self):
        """If check_dependencies is not implemented, raise error."""
        raise NotImplementedError(("every method needs a 'check_dependencies' "
                                   'method'))
    
    def check(self):
        """If check is not implemented, raise error."""
        # the check method should be called before a calculation (so
        # calculations are not repeated) and after (to verify success)
        raise NotImplementedError("every method needs a 'check' method")
    
    def map(self, inchikey, inchikey_dir):
        """Generally, maps molecule to calculation via method, emits
        query/value pairs.
        """
        raise NotImplementedError(("every method needs a 'map' method"))
    
    def reduce(self, query, values):
        """Run queries/values on the db."""
        total_changes = self.db.total_changes
        if query or values[0]:
            self.db.executemany(query, values)
            self.log_all.info('%i properties added to MESS.DB',
                              self.db.total_changes - total_changes)
            total_changes = self.db.total_changes
    
    def setup(self):
        """Set up method."""
        self._insert_program()
        self._insert_method()
        self._insert_parameters()