def map(self, mol, source):
    """Import molecule into MESS.DB.

    Sets self.inchikey from the molecule, makes sure the molecule directory
    exists, and (re)creates the per-molecule files whenever self.check()
    reports that the molecule is not fully set up. The source bookkeeping and
    the database queries are produced on every call.

    Args:
        mol: Molecule object providing write() and a title attribute
             (presumably a pybel molecule -- confirm against caller).
        source: Source object providing update_source_tsv(),
                update_molecule_source_query() and a dirname attribute.

    Yields:
        The value returned by source.update_molecule_source_query(), the
        value returned by self.insert_molecule_query(), and then every
        (query, values) pair produced by self.get_insert_moldata_queries()
        and self.get_openbabel_property_queries().
    """
    # setup local variables
    self.inchikey = mol.write('inchikey').rstrip()
    inchikey_dir = get_inchikey_dir(self.inchikey)
    inchikey_basename = os.path.join(inchikey_dir, self.inchikey)
    identifier = unicode(mol.title, 'utf-8', 'replace')
    # setup directory
    setup_dir(inchikey_dir)
    if not self.check():
        # The title is blanked before the InChI is written out.
        mol.title = b''
        mol.write('inchi', inchikey_basename + '.inchi', overwrite=True)
        if not os.path.exists(inchikey_basename + '.png'):
            mol.write('_png2', inchikey_basename + '.png')
        touch(inchikey_basename + '.log')
        touch(inchikey_basename + '.notes')
        # inchikey_basename already contains inchikey_dir; joining the
        # directory onto it again only worked for absolute directories.
        touch(inchikey_basename + '.sources.tsv')
        self.log_all.info('%s molecule directory initialized',
                          self.inchikey)
    source.update_source_tsv(self.inchikey, identifier)
    yield source.update_molecule_source_query(self.inchikey, identifier)
    yield self.insert_molecule_query(self.inchikey, mol)
    description = 'molecule data from %s input' % source.dirname
    for query, values in self.get_insert_moldata_queries(
            self.inchikey, mol, description=description):
        yield query, values
    for query, values in self.get_openbabel_property_queries(self.inchikey,
                                                             mol):
        yield query, values
def _to_all(self, inchikey=None):
    """Log to console, central log, and molecule log.

    Use to report things that change the database. The first file handler
    on the 'mess' logger that points into 'molecules/' or at '/dev/null' is
    swapped so that it targets the log file of the given molecule, or
    '/dev/null' when no molecule is given.

    Args:
        inchikey: InChIKey of the molecule whose log file should receive the
                  messages, or None to send the molecule log to '/dev/null'.

    Returns:
        The 'mess.<context>' logger when self.context is set, otherwise the
        'mess' logger.

    Raises:
        SystemExit: If inchikey is given but is_inchikey() rejects it.
    """
    if inchikey is not None and not is_inchikey(inchikey):
        sys.exit('invalid inchikey passed to logger')
    logger = logging.getLogger('mess')
    # Iterate over a snapshot because the handler list is edited below.
    for handler in list(logger.handlers):
        try:
            filename = handler.baseFilename
        except AttributeError:
            # Handlers without a baseFilename are not file handlers.
            continue
        if 'molecules/' not in filename and '/dev/null' not in filename:
            continue
        if inchikey is not None:
            mol_log = '%s/%s.log' % (get_inchikey_dir(inchikey), inchikey)
            replacement = logging.FileHandler(mol_log)
        elif '/dev/null' not in filename:
            replacement = logging.FileHandler('/dev/null')
        else:
            # Already pointing at /dev/null and no molecule requested.
            continue
        logger.removeHandler(handler)
        # Close the handler being replaced; otherwise every call leaks the
        # file descriptor of the previous molecule log.
        handler.close()
        logger.addHandler(replacement)
        break
    if self.context is not None:
        return logging.getLogger('mess.%s' % self.context.lower())
    return logging.getLogger('mess')
def execute(self, args):
    """Remove specified elements.

    For every input line the molecule directory is deleted, now-empty parent
    directories are pruned, and the rows keyed by the InChIKey are deleted
    from the molecule tables. Changes are committed per molecule.

    Args:
        args: Parsed arguments; args.inchikeys is an iterable of lines whose
              first whitespace-delimited field is an InChIKey.
    """
    db = MessDB()
    cur = db.cursor()
    delete_queries = (
        'DELETE from molecule WHERE inchikey=?',
        'DELETE from molecule_synonym WHERE inchikey=?',
        'DELETE from molecule_source WHERE inchikey=?',
        ('DELETE from molecule_state_method_property '
         'WHERE inchikey=?'),
    )
    for row in args.inchikeys:
        fields = row.split()
        if not fields:
            # A blank line carries no InChIKey; skip it instead of crashing
            # with an IndexError half-way through a destructive run.
            continue
        inchikey = fields[0]
        # Resolve the directory outside the try block so the pruning step
        # below never sees a stale or unbound value.
        inchikey_dir = get_inchikey_dir(inchikey)
        try:
            shutil.rmtree(inchikey_dir)
        except OSError:
            self.log_console.info('%s did not have a directory', inchikey)
        else:
            self.log_all.info('%s dir removed', inchikey)
        try:
            parent = os.path.relpath(os.path.join(inchikey_dir, '../'))
            # Best effort: removedirs fails once it reaches a directory
            # that still has content.
            os.removedirs(parent)
        except OSError:
            pass
        records = 0
        for query in delete_queries:
            cur.execute(query, (inchikey,))
            records += cur.rowcount
        db.commit()
        self.log_all.info('%i %s records removed from db',
                          records, inchikey)
def map(self, mol, source):
    """Import molecule into MESS.DB.

    Sets self.inchikey from the molecule and, unless self.check() already
    passes, writes the molecule as an xyz file into the method directory of
    the molecule.

    Args:
        mol: Molecule object providing write().
        source: Source object; only its dirname is used, for the log entry.
    """
    self.inchikey = mol.write('inchikey').rstrip()
    if self.check():
        # Nothing to do when the check already passes.
        return
    method_path = os.path.join(get_inchikey_dir(self.inchikey),
                               self.method_dir)
    setup_dir(method_path)
    xyz_name = '%s.xyz' % self.inchikey
    mol.write('xyz', os.path.join(method_path, xyz_name), overwrite=True)
    self.log_all.info('%s 3D structure from %s added',
                      self.inchikey, source.dirname)
def mapreduce_local(self, inchikeys, method):
    """Run a method's map and reduce functions locally.

    Every InChIKey is validated and mapped in turn; the mapped values are
    grouped by key and each group is then handed to method.reduce().

    Args:
        inchikeys: Iterable of InChIKey strings.
        method: Object providing map(inchikey, inchikey_dir), which yields
                (key, values) pairs, and reduce(key, values).

    Raises:
        SystemExit: On the first InChIKey that is not a standard InChIKey.
    """
    grouped = {}
    for inchikey in inchikeys:
        if not is_inchikey(inchikey, enforce_standard=True):
            sys.exit('%s is not a valid InChIKey.' % inchikey)
        mapped = method.map(inchikey, get_inchikey_dir(inchikey))
        for key, values in mapped:
            grouped.setdefault(key, []).append(values)
    for key, values in grouped.iteritems():
        method.reduce(key, values)
def check(self):
    """Check that a stored xyz geometry exists and matches self.inchikey.

    Returns:
        True if the xyz file in the method directory can be read and the
        InChIKey derived from it equals self.inchikey. False if the file
        cannot be read, contains no molecule, or yields another InChIKey
        (the last case is also reported as a console warning).
    """
    xyz_file = os.path.join(get_inchikey_dir(self.inchikey),
                            self.method_dir,
                            '%s.xyz' % self.inchikey)
    try:
        mol = next(pybel.readfile('xyz', xyz_file))
    except (IOError, StopIteration):
        # StopIteration: the file exists but no molecule could be read from
        # it; treat that like a missing geometry instead of crashing.
        return False
    decorate(mol, UnicodeDecorator)
    if mol.write('inchikey').rstrip() != self.inchikey:
        self.log_console.warning('inconsistent 3D geometry in %s (%s)',
                                 self.inchikey, self.method_dir)
        return False
    return True
def mapreduce_server(self, inchikeys, method):
    """Start a mapreduce server.

    Builds the InChIKey -> molecule directory datasource, writes this
    machine's hostname to a host file named after method.hash, and then
    runs the server until it returns.

    Args:
        inchikeys: Iterable of InChIKey strings.
        method: Object whose hash attribute is used as the server password.

    Raises:
        SystemExit: On the first InChIKey that is not a standard InChIKey.
    """
    self.log_console.info('hostname is %s' % gethostname())
    datasource = {}
    for inchikey in inchikeys:
        if is_inchikey(inchikey, enforce_standard=True):
            datasource[inchikey] = get_inchikey_dir(inchikey)
        else:
            sys.exit('%s is not a valid InChIKey.' % inchikey)
    server = mapreduce.Server()
    server.datasource = datasource
    server.password = method.hash
    # The host file is written relative to this module's directory.
    module_dir = os.path.dirname(__file__)
    host_name = '../../temp/%s.host' % server.password
    hostfile = os.path.join(module_dir, host_name)
    with open(hostfile, 'w') as host_handle:
        host_handle.write(gethostname())
    server.run()
    self.log_console.info('all mappers and reducers have finished')
def update_source_tsv(self, inchikey, identifier):
    """Update the sources.tsv file.

    Appends a (name, dirname, identifier, url) row for this source to the
    molecule's sources.tsv unless a row with the same dirname and
    identifier is already present.

    Args:
        inchikey: InChIKey of the molecule whose sources.tsv is updated.
        identifier: A source identifier (usually a catalog number).
    """
    inchikey_dir = get_inchikey_dir(inchikey)
    name = self.name.encode('ascii', 'replace')
    dirname = self.dirname.encode('ascii', 'replace')
    identifier = identifier.encode('ascii', 'replace')
    sources_tsv = os.path.join(inchikey_dir, '%s.sources.tsv' % inchikey)
    # check if source has been recorded
    with codecs.open(sources_tsv, 'r', 'ascii') as sources_in:
        for row in csv.reader(sources_in, delimiter=b'\t'):
            # Rows with fewer than three columns cannot match.
            if len(row) > 2 and row[1] == dirname and row[2] == identifier:
                return
    if self.url_template:
        # The template holds a bracketed "match, replace" pair that is
        # applied to the identifier with re.sub; text before and after the
        # brackets is kept as the URL prefix and suffix.
        url_split = re.split(r"\[|\]", self.url_template)
        (match, replace) = re.split(r",\s?", url_split[1])
        url_identifier = re.sub(match, replace, identifier)
        source_url = url_split[0] + url_identifier
        if 2 < len(url_split):
            source_url += url_split[2]
    else:
        source_url = ''
    # Open for appending only after reading is finished and only when a row
    # actually has to be written.
    with codecs.open(sources_tsv, 'a', 'ascii') as sources_out:
        csv.writer(sources_out, delimiter=b'\t').writerow(
            [name, dirname, identifier,
             source_url.encode('ascii', 'replace')])
    self.log.inchikey = inchikey
    try:
        self.log.info('%s added to %s sources', name, inchikey)
    finally:
        # Always clear the molecule again, even if logging fails.
        self.log.inchikey = None
def check(self):
    """Check that a valid molecule folder was created and that there is a
    matching molecule in the database.

    The molecule is identified by self.inchikey and looked up through
    self.db; the method takes no arguments.

    Returns:
        True if the .inchi file can be read, its InChI maps to
        self.inchikey in the molecule table, and the .log, .notes, .png and
        .sources.tsv files can be opened. False otherwise.
    """
    inchikey_dir = get_inchikey_dir(self.inchikey)
    basename = os.path.join(inchikey_dir, self.inchikey)
    try:
        with codecs.open(basename + '.inchi', encoding='utf-8') as file_:
            inchi_str = file_.readline().split('=')[1].strip()
    except (IOError, IndexError):
        # IndexError: the file is empty or has no '=' in its first line;
        # that is an invalid folder, not a reason to crash.
        return False
    query = 'SELECT inchikey FROM molecule WHERE inchi=?'
    row = self.db.execute(query, (inchi_str,)).fetchone()
    try:
        db_inchikey = row.inchikey
    except AttributeError:
        # No usable row came back (e.g. fetchone() returned None).
        return False
    if db_inchikey != self.inchikey:
        return False
    for suffix in ('.log', '.notes', '.png', '.sources.tsv'):
        try:
            # Only the ability to open the file matters, not its content.
            with codecs.open(basename + suffix, encoding='utf-8'):
                pass
        except IOError:
            return False
    return True
def execute(self, args):
    """Match molecules to SMARTS patterns, fingerprints, or Spectrophores.

    Exactly one of args.smarts, args.fingerprint and args.spectrophore must
    be selected. Results are written to stdout as delimited rows, one per
    match (or one per molecule when no target is given).

    Args:
        args: Parsed arguments; args.inchikeys is a file-like iterable of
              lines whose first whitespace-delimited field is an InChIKey.

    Raises:
        SystemExit: If no input is given, no or several operations are
                    selected, or --spectrophore is used without --path.
    """
    if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
        sys.exit('No input specified.')
    # validate the requested operation
    operations = (args.smarts, args.fingerprint, args.spectrophore)
    if not any(operations):
        sys.exit('No operations were selected, nothing to match.')
    if len([op for op in operations if op]) > 1:
        sys.exit('One thing at a time, please. The arguments --smarts, '
                 '--fingerprint, and --spectrophore are mutually '
                 'exclusive.')
    if args.smarts and args.target:
        self.log_console.warning('--target ignored, proceeding with '
                                 'SMARTS matching')
    if args.spectrophore:
        if args.path is None:
            sys.exit('Spectrophore calculation requires 3D geometry. '
                     'You must specify a 3D geometry with --path.')
        path = MethodPath()
        path.set_path(args.path)
        method_dir = path.get_path_directory()
        sp_args = {'normalization': args.spectrophore_normalization,
                   'accuracy': args.spectrophore_accuracy,
                   'stereo': args.spectrophore_stereospecificity,
                   'resolution': args.spectrophore_resolution}
    # load the target molecule and its fingerprints
    target_mol = None
    target_fp = None
    target_sp = None
    if args.target:
        if os.path.exists(args.target):
            # The file extension doubles as the input format name.
            extension = args.target.split('.')[-1]
            target_mol = pybel.readfile(extension, args.target).next()
        else:
            target_mol = pybel.readstring('smi', args.target)
    if target_mol is not None:
        if args.fingerprint:
            target_fp = self.calculate_fingerprint(target_mol,
                                                   args.fingerprint)
        if args.spectrophore:
            target_sp = self.calculate_spectrophore(target_mol, sp_args)
    # match every input molecule
    db = MessDB()
    inchi_query = 'SELECT inchi FROM molecule WHERE inchikey = ?'
    fp_query = ('SELECT fingerprint FROM molecule_fingerprint '
                'WHERE inchikey = ? AND name = ? '
                'AND settings = ? AND method_path_id = ?')
    writer = csv.writer(sys.stdout, delimiter=args.delimiter)
    for row in args.inchikeys:
        inchikey = row.split()[0].strip()
        if args.smarts or args.fingerprint:
            inchi = db.execute(inchi_query, (inchikey,)).fetchone()[0]
            mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
        if args.smarts:
            canonical = pybel.ob.OBOp.FindType(b"canonical")
            canonical.Do(mol.OBMol)
            for smarts_obj, smarts_str in self.smarts_generator(args.smarts):
                matches = list(smarts_obj.findall(mol))
                if matches:
                    writer.writerow([inchikey, smarts_str] + matches)
        if args.fingerprint:
            fp_params = (inchikey, args.fingerprint, '', '')
            try:
                fp = db.execute(fp_query, fp_params).fetchone()[0]
            except TypeError:
                # No stored fingerprint; compute it on the fly.
                fp = self.calculate_fingerprint(mol, args.fingerprint)
            if target_fp is None:
                writer.writerow([inchikey, args.fingerprint] + fp)
            else:
                similarity = self.calculate_similarity(target_fp, fp,
                                                       'tanimoto')
                if similarity > args.cutoff:
                    writer.writerow([inchikey, args.fingerprint,
                                     args.target, similarity])
        if args.spectrophore:
            try:
                sp = db.execute(fp_query,
                                (inchikey, 'Spectrophore',
                                 json.dumps(sp_args, sort_keys=True),
                                 args.path)).fetchone()[0]
            except TypeError:
                # No stored Spectrophore; compute it from the xyz geometry.
                xyz_file = os.path.join(get_inchikey_dir(inchikey),
                                        method_dir,
                                        '%s.xyz' % inchikey)
                mol = pybel.readfile('xyz', xyz_file).next()
                sp = Match.calculate_spectrophore(mol, sp_args)
            if target_sp is None:
                writer.writerow([inchikey, 'Spectrophore'] + sp)
            else:
                try:
                    similarity = self.calculate_similarity(target_sp, sp,
                                                           'cos')
                except ValueError:
                    similarity = 0
                if similarity > args.cutoff:
                    writer.writerow([inchikey, 'Spectrophore',
                                     args.target, similarity])
def test_get_inchikey_dir(self):
    """get_inchikey_dir() maps an InChIKey onto its nested molecules path."""
    here = os.path.dirname(__file__)
    molecules_dir = os.path.abspath(os.path.join(here, '../../molecules/'))
    # Expected layout: first letter / next two letters / remainder.
    expected = os.path.join(molecules_dir, 'B/QJ/CRHHNABKAKU-KBQPJGBKSA-N')
    actual = utils.get_inchikey_dir('BQJCRHHNABKAKU-KBQPJGBKSA-N')
    self.assertEqual(actual, expected)
def execute(self, args):
    """Annotate molecules with names, fingerprints, and Spectrophores.

    Depending on the selected options, each input molecule gets its IUPAC
    name and synonyms updated (args.cir), and/or a fingerprint
    (args.fingerprint) or Spectrophore (args.spectrophore) calculated. A
    calculated fingerprint is inserted into molecule_fingerprint when no
    matching row exists; when a row exists and differs, a warning is
    logged and the database is left untouched.

    Args:
        args: Parsed arguments; args.inchikeys is a file-like iterable of
              lines whose first whitespace-delimited field is an InChIKey.

    Raises:
        SystemExit: If no input is given, no annotation is requested, or
                    --spectrophore is used without --path.
    """
    if args.inchikeys.name == '<stdin>' and args.inchikeys.isatty():
        sys.exit('No input specified.')
    if not (args.cir or args.fingerprint or args.spectrophore):
        sys.exit('You did not request any annotations.')
    if args.spectrophore:
        if args.path is None:
            sys.exit(('Spectrophore calculation requires 3D geometry. '
                      'You must specify a 3D geometry with --path.'))
        path = MethodPath()
        path.set_path(args.path)
        method_dir = path.get_path_directory()
        sp_args = {'normalization': args.spectrophore_normalization,
                   'accuracy': args.spectrophore_accuracy,
                   'stereo': args.spectrophore_stereospecificity,
                   'resolution': args.spectrophore_resolution}
    self.db = MessDB()
    inchi_select_query = 'SELECT inchi FROM molecule WHERE inchikey = ?'
    fp_select_query = ('SELECT fingerprint FROM molecule_fingerprint '
                       'WHERE inchikey = ? '
                       'AND name = ? '
                       'AND settings = ? '
                       'AND method_path_id = ?')
    fp_insert_query = ('INSERT INTO molecule_fingerprint '
                       '(inchikey, name, settings, '
                       'fingerprint, method_path_id) '
                       'VALUES (?, ?, ?, ?, ?)')
    for row in args.inchikeys:
        fields = row.split()
        if not fields:
            # A blank line carries no InChIKey; skip it.
            continue
        self.inchikey = fields[0]
        if args.cir:
            self.update_iupac(self.inchikey)
            self.update_synonyms(self.inchikey)
        if args.fingerprint:
            inchi = self.db.execute(inchi_select_query,
                                    (self.inchikey,)).fetchone()[0]
            mol = pybel.readstring('inchi', 'InChI=%s' % inchi)
            canonical = pybel.ob.OBOp.FindType(b'canonical')
            canonical.Do(mol.OBMol)
            fp = Match.calculate_fingerprint(mol, args.fingerprint)
            # Test for a missing row explicitly rather than catching the
            # TypeError from indexing None, which also hid unrelated errors.
            db_row = self.db.execute(fp_select_query,
                                     (self.inchikey, args.fingerprint,
                                      '', '')).fetchone()
            if db_row is None:
                self.db.execute(fp_insert_query,
                                (self.inchikey, args.fingerprint,
                                 '', str(fp), ''))
                self.log_all.info('%s fingerprint for %s added to db',
                                  args.fingerprint, self.inchikey)
            elif str(fp) != db_row[0]:
                self.log_console.warning(('new %s fingerprint '
                                          'for %s did not match '
                                          'fingerprint in db, '
                                          'db not updated'),
                                         args.fingerprint, self.inchikey)
        if args.spectrophore:
            xyz_file = os.path.join(get_inchikey_dir(self.inchikey),
                                    method_dir,
                                    '%s.xyz' % self.inchikey)
            mol = pybel.readfile('xyz', xyz_file).next()
            sp = Match.calculate_spectrophore(mol, sp_args)
            # Serialize the settings once; they key both lookup and insert.
            json_sp_args = json.dumps(sp_args, sort_keys=True)
            db_row = self.db.execute(fp_select_query,
                                     (self.inchikey, 'Spectrophore',
                                      json_sp_args, args.path)).fetchone()
            if db_row is None:
                self.db.execute(fp_insert_query,
                                (self.inchikey, 'Spectrophore',
                                 json_sp_args, str(sp), args.path))
                # NOTE(review): '%i' assumes args.path is an integer --
                # confirm against the argument parser.
                self.log_all.info(('Spectrophore fingerprint for %s '
                                   'with parameters %s and '
                                   'geometry from path %i '
                                   'added to db'),
                                  self.inchikey, json_sp_args, args.path)
            elif str(sp) != db_row[0]:
                self.log_console.warning(('new Spectrophore '
                                          'fingerprint for '
                                          '%s did not match '
                                          'fingerprint in db, '
                                          'db not updated'),
                                         self.inchikey)