class MessDB(object): """Manage a connection to mess.db. Attributes: tries (int): number of times a db commit has failed max_tries (int): maximum number of times to try db commit conn (obj): sqlite3 db connection object total_changes (int): number of rows modified/added/removed since open """ def __init__(self, **kwargs): """Initialize attributes and db connection. Args: All kwargs will be passed to sqlite3.connect() """ self.tries = 0 self.max_tries = 3 self.check_version() self.log_console = Log('console') self.log_all = Log('all') self.open(**kwargs) def __del__(self): """On deletion, commit transations and close the connection.""" try: self.close() except sqlite3.ProgrammingError: pass # there was no db connection @property def total_changes(self): """Returns the total number of database rows that have been modified, inserted, or deleted since the db connection was opened.""" return self.conn.total_changes @classmethod def namedtuple_factory(cls, cursor, row): """ Usage: conn.row_factory = namedtuple_factory """ fields = [col[0] for col in cursor.description] row_tuple = namedtuple('Row', fields) return row_tuple(*row) @classmethod def check_version(cls): """Ensure that all dependencies are satisfied.""" if LooseVersion(sqlite3.sqlite_version) < LooseVersion('3.7.0'): sys.exit(('The database requres SQLite version 3.7 or greater ' 'due to the use of WAL mode.')) return sqlite3.sqlite_version def check_tables(self): """Check that mess.db has the right number of tables in it.""" query = 'SELECT count(*) count FROM sqlite_master WHERE type=?' result = self.cursor().execute(query, ('table',)).next() if result.count != 14: return False return True def initialize(self): """Load the mess.db schema.""" tables = os.path.join(os.path.dirname(__file__), '../db/schema/tables.sql') views = os.path.join(os.path.dirname(__file__), '../db/schema/views.sql') triggers = os.path.join(os.path.dirname(__file__), '../db/schema/triggers.sql') self.executescript(codecs.open(tables, encoding='utf-8').read()) self.executescript(codecs.open(views, encoding='utf-8').read()) self.executescript(codecs.open(triggers, encoding='utf-8').read()) result = self.execute('PRAGMA journal_mode=wal').next() if not result.journal_mode == 'wal': self.log_console.warning(('Setting journal mode to WAL failed. ' 'Run PRAGMA journal_mode=wal in SQLite ' 'before continuing.')) self.log_all.info('new mess.db initialized') def open(self, **kwargs): """Open a connection to db/mess.db and set up row factory.""" try: if 'database' not in kwargs: kwargs['database'] = os.path.join(os.path.dirname(__file__), '../db/mess.db') if 'timeout' not in kwargs: kwargs['timeout'] = 120 self.conn = sqlite3.connect(**kwargs) except IOError: sys.exit('could not find/create mess.db') self.conn.row_factory = self.namedtuple_factory if not self.check_tables(): self.initialize() def reopen(self): """Commit, close, and open db connection.""" self.close() self.open() def close(self): """Close the db connection.""" self.commit() self.conn.close() def cursor(self): """Return cursor.""" return self.conn.cursor() def execute(self, query, values=()): """Execute a single query.""" try: with self.conn: return self.conn.execute(query, values) except sqlite3.Error as err: self.log_console.error('%s failed with values %s\n%s', query, values, err) def executemany(self, query, values): """Execute a single query with many values.""" try: with self.conn: return self.conn.executemany(query, values) except sqlite3.Error as err: self.log_console.error('%s failed with many values\n%s', query, err) def executescript(self, script): """Read and execute queries from file.""" try: with self.conn: return self.conn.executescript(script) except sqlite3.Error as err: self.log_console.error('script %s failed\n%s', script, err) def commit(self): """Commit transactions to db.""" while self.tries < self.max_tries: try: return self.conn.commit() except sqlite3.OperationalError: self.log_console.info('database is locked, trying again') self.tries += 1 sys.exit('Database is still locked after %d tries.' % self.tries)
class Source(object): """This class provides methods for importing sources and maintaining source information in mess.db and in the molecule directories. Attributes: db (obj): A MessDB object log (obj): A Log('all') object source_dir (str): A path to the source directory id (int): The source_id in the mess.db source table name (str): A name for the source dirname (str): The name of the source subdirectory in the 'sources' directory url (str): Url where the source can be downloaded url_template (str): A url template that can be used to go to the web page for a particular molecule in a source catalog last_update (str): Date when source was last downloaded """ def __init__(self): """Initialize db cursor. Args: db (obj): A MessDB object """ self.db = MessDB() self.log = Log('all') self.source_dir = None self.id = None self.name = None self.dirname = None self.url = None self.url_template = None self.last_update = None @classmethod def get_sources(cls): """Returns a dictionary of source basenames mapped to source paths.""" source_dir = os.path.join(os.path.dirname(__file__), '../sources/') sources = {} for root, _, filenames in os.walk(source_dir): for filename in [f for f in filenames if f.endswith('.ini') and not (f.startswith('.') or f.startswith('~'))]: ini_basename = os.path.splitext(filename)[0] if ini_basename == os.path.basename(root): sources[ini_basename] = os.path.abspath(root) return sources def files(self): """Returns a list of importable files in the source directory.""" return [f for f in os.listdir(self.source_dir) if not (f.startswith('.') or f.startswith('~') or f.endswith('.txt')) and f.split('.')[-1] in pybel.informats] def setup(self, source): """Setup source in mess.db. Args: source: A path to a source directory or a source basename. """ source_basename = os.path.basename(source.rstrip(os.sep)) if source_basename.endswith('.ini'): source_basename = os.path.splitext(source_basename)[0] sources = self.get_sources() if source_basename not in sources: sys.exit(("All sources must reside in the 'sources' directory, " 'read sources/SOURCES_README.md for details.')) self.source_dir = sources[source_basename] source_ini = os.path.join(self.source_dir, '%s.ini' % source_basename) source_attributes = self.parse_ini(source_ini) # insert/update source in the database total_changes = self.db.total_changes insert_query = ('INSERT OR IGNORE INTO source ' '(name, dirname, url, url_template, ' 'citation, last_update) ' 'VALUES (?, ?, null, null, null, null)') update_query = ('UPDATE source ' 'SET url=?, url_template=?, citation=?, last_update=? ' 'WHERE dirname=?;') self.db.execute(insert_query, (source_attributes['name'], source_basename)) self.db.execute(update_query, (source_attributes['url'], source_attributes['url_template'], source_attributes['citation'], source_attributes['last_update'], source_basename)) if self.db.total_changes - total_changes > 1: self.log.info('%s added to sources in database', source_basename) select_query = ('SELECT source_id, name, dirname, ' 'url, url_template, last_update ' 'FROM source WHERE dirname=?') source_row = self.db.execute(select_query, (source_basename,)).fetchone() # set attributes self.id = source_row.source_id self.name = source_row.name self.dirname = source_row.dirname self.url = source_row.url self.url_template = source_row.url_template self.last_update = source_row.last_update def parse_ini(self, ini): """Parse source ini and return attributes.""" source_attributes = { 'url': None, 'url_template': None, 'citation': None } config = cp.ConfigParser(dict_type=CaseInsensitiveDict) config.read(ini) for section in config.sections(): for option in config.options(section): source_attributes[option] = unicode_replace(config.get(section, option)) required_attributes = ('name', 'last_update') if not all(att in source_attributes for att in required_attributes): sys.exit('Source INI missing required attributes: %s.' % ' and/or '.join(required_attributes)) return source_attributes def update_molecule_source_query(self, inchikey, identifier): """Update the source in mess.db. Args: inchikey: A molecule InChIKey. identifier: A source identifier (usually a catalog number). """ query = ('INSERT OR IGNORE INTO molecule_source ' '(inchikey, source_id, identifier) ' 'VALUES (?, ?, ?)') return (query, (inchikey, self.id, identifier)) def update_source_tsv(self, inchikey, identifier): """Update the sources.tsv file. Args: inchikey_dir: Dir to a molecule in the molecules dir. identifier: A source identifier (usually a catalog number). """ inchikey_dir = get_inchikey_dir(inchikey) name = self.name.encode('ascii', 'replace') dirname = self.dirname.encode('ascii', 'replace') identifier = identifier.encode('ascii', 'replace') sources_tsv = os.path.join(inchikey_dir, '%s.sources.tsv' % inchikey) with codecs.open(sources_tsv, 'r', 'ascii') as sources_in: with codecs.open(sources_tsv, 'a', 'ascii') as sources_out: sources_in = csv.reader(sources_in, delimiter=b'\t') sources_out = csv.writer(sources_out, delimiter=b'\t') # check if source has been recorded source_present = False for row in sources_in: try: if row[1] == dirname and row[2] == identifier: source_present = True except IndexError: pass if not source_present: if self.url_template: url_split = re.split(r"\[|\]", self.url_template) (match, replace) = re.split(r",\s?", url_split[1]) url_identifier = re.sub(match, replace, identifier) source_url = url_split[0] + url_identifier if 2 < len(url_split): source_url += url_split[2] else: source_url = '' sources_out.writerow([name, dirname, identifier, source_url.encode('ascii', 'replace')]) self.log.inchikey = inchikey self.log.info('%s added to %s sources', name, inchikey) self.log.inchikey = None
class AbstractMethod(object): """All methods must inherit from this class. Attributes: db (obj): A MessDB object method_name (str): The name of the method description (str): Description of method geop (bool): Whether the method generates a new geometry prog_name (str): Program name prog_version (str): Program version prog_url (str): Program url parameters (dict): Parameters that affect program execution """ parameters = dict() shortdesc = None method_citation = None prog_citation = None _inchikey = None _path_id = None _parent_path_id = None _method_dir = None _parent_method_dir = None def __init__(self): """Set up db, check for attributes, dependencies, and setup.""" self.db = MessDB() self.path = MethodPath() self.log_console = Log('console') self.log_all = Log('all') self.method_name = self.get_method_name() try: self.parameters self.description self.geop # flag indicates method results in new xyz coordinates self.prog_name self.prog_version self.prog_url except AttributeError as err: print(''.join([str(err), '\n']), file=sys.stderr) sys.exit(('Each method class needs to define description, geop, ' 'prog_name, prog_version, prog_url, ' 'parameters as attributes.')) self.check_dependencies() def __hash__(self): """Hash based on method name and parameters. Returns: A hex string of the sha1 hash of self.method_name plus JSON-serialized self.parameters. Keys are sorted. """ return hashlib.sha1(self.method_name + json.dumps(dict((str(k).lower(), str(v).lower()) for k, v in self.parameters.iteritems()), sort_keys=True)).hexdigest() @property def hash(self): """Get hash.""" return self.__hash__() @property def method_id(self): """Get the object's method_id attribute.""" query = ('SELECT method_id FROM method ' 'WHERE hash = ?;') row = self.db.execute(query, (self.hash,)).fetchone() return row.method_id @property def path_id(self): """Get the path id of the method.""" if not self.path.get_method_id() == self.method_id: self._setup_path() return self._path_id @property def method_dir(self): """Get the directory name of the method.""" if not self.path.get_method_id() == self.method_id: self._setup_path() return self._method_dir @property def parent_method_dir(self): """Get the parent directory name of the method.""" if not self.path.get_method_id() == self.method_id: self._setup_path() return self._parent_method_dir @property def inchikey(self): """Get inchikey.""" return self._inchikey @inchikey.setter def inchikey(self, inchikey): """Set inchikey, and update inchikey of logger.""" if inchikey is not None and not is_inchikey(inchikey): raise RuntimeError('invalid inchikey: %s' % inchikey) self._inchikey = inchikey self.log_all.inchikey = inchikey @classmethod def get_method_name(cls): """Return the name of the method, derived from the subclass name.""" return cls.__name__.replace('_', '').lower() def _setup_path(self): """Setup path given current method id and parent path.""" self.path.setup_path(self.method_id, self._parent_path_id) self._path_id = self.path.get_path_id() self._method_dir = self.path.get_path_directory() self._parent_method_dir = self.path.get_parent_path_directory() def _insert_method(self): """Set insert program to db, set up hash, and insert method to db.""" total_changes = self.db.total_changes query = ('INSERT OR IGNORE INTO method ' '(program_id, geop, name, shortdesc, citation, hash) ' 'SELECT program.program_id, ?, ?, ?, ?, ? ' 'FROM program ' 'WHERE program.name=? AND program.version=?') self.db.execute(query, (self.geop, self.method_name, self.shortdesc, self.method_citation, self.hash, self.prog_name, self.prog_version)) if self.db.total_changes - total_changes > 0: self.log_all.info('new %s method added to MESS.DB', self.method_name) def _insert_program(self): """Adds row to program table in mess.db.""" total_changes = self.db.total_changes query = ('INSERT OR IGNORE INTO program ' '(name, version, url, citation) ' 'VALUES (?, ?, ?, ?)') self.db.execute(query, (self.prog_name, self.prog_version, self.prog_url, self.prog_citation)) if self.db.total_changes - total_changes > 0: self.log_all.info('program %s %s added to MESS.DB', self.prog_name, self.prog_version) def _insert_parameters(self): """Import paramaters dict to mess.db. Args: name: Name of parameter. setting: The value the parameter is set to. """ added_parameters = 0 for name, setting in self.parameters.items(): query = ('INSERT OR IGNORE INTO parameter (name) VALUES (?)') self.db.execute(query, (name, )) total_changes = self.db.total_changes query = ('INSERT OR IGNORE INTO method_parameter ' '(method_id, parameter_id, setting) ' 'SELECT ?, parameter.parameter_id, ? ' 'FROM program, parameter ' 'WHERE parameter.name=?') self.db.execute(query, (self.method_id, setting, name)) added_parameters += (self.db.total_changes - total_changes) if added_parameters > 0: self.log_all.info('%i method parameters added to MESS.DB', added_parameters) def get_insert_property_query(self, inchikey, name, description, format_, value, units=''): """Returns query to insert property value to mess.db. Args: inchikey: The inchikey of a molecule in MESS.DB. method_path_id: Path id for the calculations that generated the property. name: The property name. description: A description of the property. format_: A description of the format the property is in. value: The calculated property. units: Units for the property value. """ query = ('INSERT OR IGNORE INTO molecule_method_property_denorm ' 'VALUES (?, ?, ?, ?, ?, ?, ?);') return (query, (inchikey, self.path_id, name, description, format_, units, value)) def get_insert_moldata_queries(self, inchikey, mol, description='', units=''): """Returns queries to insert molecule data values to mess.db.""" for name, value in mol.data.iteritems(): yield self.get_insert_property_query(inchikey, name, description, type(value).__name__, value, units) def get_timing_query(self, inchikey, start): """Get a query to insert execution time property into db.""" return self.get_insert_property_query(inchikey, 'runtime', 'execution time', type(start).__name__, time.time() - start, 's') def set_parent_path(self, parent_path): """Set the parent path (e.g., path to method containing input geometry.)""" if parent_path > 0: self._parent_path_id = parent_path def has_parent_path(self, inchikey): """Returns True if molecule has had entire parent path calculated, False otherwise.""" query = ('SELECT inchikey FROM molecule_method_property WHERE ' 'inchikey = ? AND method_path_id = ?') try: self.db.execute(query, (inchikey, self._parent_path_id)).fetchone()[0] return True except TypeError: return False def check_dependencies(self): """If check_dependencies is not implemented, raise error.""" raise NotImplementedError(("every method needs a 'check_dependencies' " 'method')) def check(self): """If check is not implemented, raise error.""" # the check method should be called before a calculation (so # calculations are not repeated) and after (to verify success) raise NotImplementedError("every method needs a 'check' method") def map(self, inchikey, inchikey_dir): """Generally, maps molecule to calculation via method, emits query/value pairs. """ raise NotImplementedError(("every method needs a 'map' method")) def reduce(self, query, values): """Run queries/values on the db.""" total_changes = self.db.total_changes if query or values[0]: self.db.executemany(query, values) self.log_all.info('%i properties added to MESS.DB', self.db.total_changes - total_changes) total_changes = self.db.total_changes def setup(self): """Set up method.""" self._insert_program() self._insert_method() self._insert_parameters()