def __init__(self, S, cids, rids=None):
    """
        Store a stoichiometric matrix together with its row/column labels.

        Arguments:
            S    - matrix-like object exposing a 'shape' attribute
            cids - sequence of compound IDs, one per row of S
            rids - optional sequence of reaction IDs, one per column of S
                   (the column count is only checked when this is given)

        Raises an AssertionError if the number of labels does not match
        the corresponding dimension of S.
    """
    self.S, self.cids, self.rids = S, cids, rids

    # one compound ID per row of the matrix
    assert len(cids) == S.shape[0]

    # reaction IDs are optional, so only validate them when provided
    if rids is not None:
        assert len(rids) == S.shape[1]

    self.ccache = CompoundCacher.getInstance()
def main(fname, pH, I, T):
    """
        Print, for every row of a tab-delimited file, the compound ID, the
        dG0 value read from the file and the dG0 value after adding the
        compound's transform.

        Arguments:
            fname - path of a tab-delimited file; the first column must
                    contain the pattern 'C<digits>_10' and the second
                    column a number
            pH, I, T - passed unchanged to comp.transform_neutral()
                    (presumably pH, ionic strength and temperature -
                    TODO confirm units against the Compound class)

        Raises IndexError if a first column does not match the pattern and
        ValueError if the second column is not a number.
    """
    ccache = CompoundCacher.getInstance()

    # open the file in a 'with' block so the handle is closed even if a
    # row fails to parse (the original never closed it)
    with open(fname, 'r') as tsv_file:
        for row in csv.reader(tsv_file, delimiter='\t'):
            cid = int(re.findall('C([0-9]+)_10', row[0])[0])
            dG0 = float(row[1])
            comp = ccache.get_kegg_compound(cid)
            dG0_prime = dG0 + comp.transform_neutral(pH, I, T)
            # single-argument parenthesised form prints the same text
            # whether 'print' is a statement or a function
            print('C%05d\t%f\t%f' % (cid, dG0, dG0_prime))

    ccache.dump()
def __init__(self, training_data):
    """
        Set up the helpers needed for group decomposition and keep a
        reference to the training data.

        The groups definition is loaded from GROUP_CSV (untransformed) and
        is used to build the InChI-to-group-vector converter and the list
        of group names. No decomposition is performed here.
    """
    self.training_data = training_data
    self.ccache = CompoundCacher.getInstance()

    groups = GroupsData.FromGroupsFile(GROUP_CSV, transformed=False)
    self.groups_data = groups
    self.inchi2gv = InChI2GroupVector(groups)
    self.group_names = groups.GetGroupNames()
def __init__(self):
    """
        Load the TECRDB, formation and redox training sets and combine
        them into one stoichiometric matrix plus per-reaction vectors.

        The data files are looked up relative to the directory of this
        module, using the (filename, weight) pairs stored in
        TrainingData.FNAME_DICT. After the arrays are built, the reactions
        whose 'balance' entry is true are passed to
        self.balance_reactions() and self.reverse_transform() is called.
    """
    self.ccache = CompoundCacher.getInstance()

    base_path = os.path.split(os.path.realpath(__file__))[0]

    fname, weight = TrainingData.FNAME_DICT['TECRDB']
    fname = os.path.join(base_path, fname)
    tecrdb_params = TrainingData.read_tecrdb(fname, weight)

    fname, weight = TrainingData.FNAME_DICT['FORMATION']
    fname = os.path.join(base_path, fname)
    formation_params, cids_that_dont_decompose = \
        TrainingData.read_formations(fname, weight)

    fname, weight = TrainingData.FNAME_DICT['REDOX']
    fname = os.path.join(base_path, fname)
    redox_params = TrainingData.read_redox(fname, weight)

    thermo_params = tecrdb_params + formation_params + redox_params

    # collect every compound ID that appears in any reaction
    cids = set()
    for d in thermo_params:
        cids.update(d['reaction'].keys())
    cids = sorted(cids)

    # convert the list of reactions in sparse notation into a full
    # stoichiometric matrix, where the rows (compounds) are according to
    # the CID list 'cids'. The row of each CID is looked up in a dict
    # instead of calling cids.index() for every coefficient, which was a
    # linear scan per entry.
    row_of_cid = dict((cid, row) for row, cid in enumerate(cids))
    self.S = np.zeros((len(cids), len(thermo_params)))
    for k, d in enumerate(thermo_params):
        for cid, coeff in d['reaction'].iteritems():
            self.S[row_of_cid[cid], k] = coeff

    self.cids = cids
    self.cids_that_dont_decompose = cids_that_dont_decompose

    # one entry per reaction, in the same order as the columns of S
    self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
    self.T = np.array([d['T'] for d in thermo_params])
    self.I = np.array([d['I'] for d in thermo_params])
    self.pH = np.array([d['pH'] for d in thermo_params])
    self.pMg = np.array([d['pMg'] for d in thermo_params])
    self.weight = np.array([d['weight'] for d in thermo_params])

    rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                           if d['balance']]
    self.balance_reactions(rxn_inds_to_balance)

    self.reverse_transform()
def __init__(self):
    """
        Load the TECRDB, formation and redox training sets and combine
        them into one stoichiometric matrix plus per-reaction vectors.

        Every filename listed in TrainingData.FNAME_DICT is first checked
        with os.path.exists() exactly as given (no base directory is
        prepended here); an Exception is raised for the first one that is
        missing. After the arrays are built, the reactions whose 'balance'
        entry is true are passed to self.balance_reactions() and
        self.reverse_transform() is called.
    """
    self.ccache = CompoundCacher.getInstance()

    # verify that the files exist
    for fname, _ in TrainingData.FNAME_DICT.values():
        if not os.path.exists(fname):
            raise Exception('file not found: ' + fname)

    tecrdb_params = TrainingData.read_tecrdb()
    formation_params, cids_that_dont_decompose = \
        TrainingData.read_formations()
    redox_params = TrainingData.read_redox()

    thermo_params = tecrdb_params + formation_params + redox_params

    # collect every compound ID that appears in any reaction
    cids = set()
    for d in thermo_params:
        cids.update(d['reaction'].keys())
    cids = sorted(cids)

    # convert the list of reactions in sparse notation into a full
    # stoichiometric matrix, where the rows (compounds) are according to
    # the CID list 'cids'. A dict gives the row of each CID directly,
    # replacing the linear cids.index() scan done for every coefficient.
    row_of_cid = dict((cid, row) for row, cid in enumerate(cids))
    self.S = np.zeros((len(cids), len(thermo_params)))
    for k, d in enumerate(thermo_params):
        for cid, coeff in d['reaction'].iteritems():
            self.S[row_of_cid[cid], k] = coeff

    self.cids = cids
    self.cids_that_dont_decompose = cids_that_dont_decompose

    # one entry per reaction, in the same order as the columns of S
    self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
    self.T = np.array([d['T'] for d in thermo_params])
    self.I = np.array([d['I'] for d in thermo_params])
    self.pH = np.array([d['pH'] for d in thermo_params])
    self.pMg = np.array([d['pMg'] for d in thermo_params])
    self.weight = np.array([d['weight'] for d in thermo_params])

    rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                           if d['balance']]
    self.balance_reactions(rxn_inds_to_balance)

    self.reverse_transform()
def __init__(self):
    """
        Read the three training sets (TECRDB, formation, redox), merge
        them and store the result as a dense stoichiometric matrix with
        matching per-reaction arrays.

        The filenames in TrainingData.FNAME_DICT are checked with
        os.path.exists() as they are; a missing file raises an Exception.
        Once the arrays are stored, reactions whose 'balance' entry is
        true are handed to self.balance_reactions(), and finally
        self.reverse_transform() is called.
    """
    self.ccache = CompoundCacher.getInstance()

    # verify that the files exist
    for fname, _ in TrainingData.FNAME_DICT.values():
        if not os.path.exists(fname):
            raise Exception('file not found: ' + fname)

    tecrdb_params = TrainingData.read_tecrdb()
    formation_params, cids_that_dont_decompose = \
        TrainingData.read_formations()
    redox_params = TrainingData.read_redox()

    thermo_params = tecrdb_params + formation_params + redox_params

    # the sorted union of all compound IDs defines the row order of S
    cids = set()
    for d in thermo_params:
        cids.update(d['reaction'].keys())
    cids = sorted(cids)

    # convert the list of reactions in sparse notation into a full
    # stoichiometric matrix, where the rows (compounds) are according to
    # the CID list 'cids'. Row positions come from a precomputed dict so
    # that filling the matrix no longer scans 'cids' for each coefficient.
    row_of_cid = dict((cid, row) for row, cid in enumerate(cids))
    self.S = np.zeros((len(cids), len(thermo_params)))
    for k, d in enumerate(thermo_params):
        for cid, coeff in d['reaction'].iteritems():
            self.S[row_of_cid[cid], k] = coeff

    self.cids = cids
    self.cids_that_dont_decompose = cids_that_dont_decompose

    # one entry per reaction, in the same order as the columns of S
    self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
    self.T = np.array([d['T'] for d in thermo_params])
    self.I = np.array([d['I'] for d in thermo_params])
    self.pH = np.array([d['pH'] for d in thermo_params])
    self.pMg = np.array([d['pMg'] for d in thermo_params])
    self.weight = np.array([d['weight'] for d in thermo_params])

    rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                           if d['balance']]
    self.balance_reactions(rxn_inds_to_balance)

    self.reverse_transform()
def __init__(self, training_data):
    """
        Prepare the group-decomposition helpers and copy the training
        arrays out of 'training_data'.

        Reads training_data.cids, .S, .dG0 and .weight; the last two are
        stored as column matrices. The group matrix, the joined
        stoichiometric matrices and the fitted parameters all start out
        as None (presumably filled in by a later training step - verify
        against the rest of the class).
    """
    self.ccache = CompoundCacher.getInstance()

    groups = init_groups_data()
    self.groups_data = groups
    self.inchi2gv = InChI2GroupVector(groups)
    self.group_names = groups.GetGroupNames()

    # training set: compound IDs, stoichiometry, and the dG0 / weight
    # vectors as column matrices
    self.train_cids = training_data.cids
    self.train_S = training_data.S
    self.train_b = np.matrix(training_data.dG0).T
    self.train_w = np.matrix(training_data.weight).T

    # not computed in the constructor
    self.train_G = None
    self.train_S_joined = None
    self.model_S_joined = None
    self.params = None
def is_balanced(self):
    """
        Check whether the reaction is balanced with respect to the
        element matrix of its compounds.

        Returns:
            True  - if every conserved quantity is zero, or if any of them
                    is NaN (in which case balancing cannot be tested)
            False - if at least one conserved quantity is non-zero

        Details are written to the debug log.
    """
    compound_ids = list(self.keys())
    # stoichiometric coefficients as a column vector
    stoich = np.array([self.sparse[c] for c in compound_ids], ndmin=2).T
    elements, Ematrix = \
        CompoundCacher.getInstance().get_kegg_ematrix(compound_ids)

    # NOTE(review): presumably Ematrix is an np.matrix, so '*' is a
    # matrix product giving one value per element - confirm
    conserved = Ematrix.T * stoich

    if np.any(np.isnan(conserved), 0):
        logging.debug('cannot test reaction balancing because of unspecific '
                      'compound formulas: %s' % self.write_formula())
        return True

    if not np.any(conserved != 0, 0):
        return True

    logging.debug('unbalanced reaction: %s' % self.write_formula())
    # report each row whose value is non-zero
    for j in np.where(conserved[:, 0])[0].flat:
        logging.debug('there are %d more %s atoms on the right-hand side' %
                      (conserved[j, 0], elements[j]))
    return False
def is_balanced(self):
    """
        Test whether the reaction's coefficients cancel out for every
        element of the compounds' element matrix.

        Returns True when all conserved quantities are zero, and also
        when any of them is NaN (balancing cannot be tested then).
        Returns False when some conserved quantity is non-zero. The
        reason is written to the debug log in the non-trivial cases.
    """
    reactants = list(self.keys())
    coeff_column = np.array([self.sparse[cid] for cid in reactants],
                            ndmin=2).T
    ematrix_result = CompoundCacher.getInstance().get_kegg_ematrix(reactants)
    elements, Ematrix = ematrix_result

    # NOTE(review): '*' is assumed to be a matrix product here, which
    # requires Ematrix to be an np.matrix - confirm
    conserved = Ematrix.T * coeff_column

    has_nan = np.any(np.isnan(conserved), 0)
    if has_nan:
        message = ('cannot test reaction balancing because of unspecific '
                   'compound formulas: %s' % self.write_formula())
        logging.debug(message)
        return True

    has_imbalance = np.any(conserved != 0, 0)
    if has_imbalance:
        logging.debug('unbalanced reaction: %s' % self.write_formula())
        nonzero_rows = np.where(conserved[:, 0])[0].flat
        for j in nonzero_rows:
            message = ('there are %d more %s atoms on the right-hand side'
                       % (conserved[j, 0], elements[j]))
            logging.debug(message)
        return False

    return True
def __init__(self, S, cids):
    """
        Store a stoichiometric matrix and the compound IDs of its rows.

        Arguments:
            S    - matrix-like object exposing a 'shape' attribute
            cids - sequence of compound IDs, one per row of S

        Raises an AssertionError if len(cids) differs from the number of
        rows of S.
    """
    self.S, self.cids = S, cids

    # one compound ID per row of the matrix
    assert len(cids) == S.shape[0]

    self.ccache = CompoundCacher.getInstance()
import sys
import logging

# the project modules imported below live in '../python', so that
# directory has to be on the path before they are imported
sys.path.append('../python')

from compound import Compound
from inchi2gv import init_groups_data, InChI2GroupVector, GroupDecompositionError
from compound_cacher import CompoundCacher
from molecule import Molecule

#logger = logging.getLogger('')
#logger.setLevel(logging.DEBUG)

ccache = CompoundCacher.getInstance('../cache/compounds.json')
groups_data = init_groups_data()
group_list = groups_data.GetGroupNames()
inchi2gv_converter = InChI2GroupVector(groups_data)

# SMARTS patterns; not referenced by the loop below, which uses a single
# hard-coded pattern
patterns = ['c~[O;+0]', 'c~[O;+1]', 'c~[n;+1]~c', 'c~[n;+0]~c', 'c~[n;-1]~c']

# For each compound: print a separator with its ID, its SMILES string and
# the matches of one SMARTS pattern, then try to decompose it into groups.
for cid in (255, 1007):
    comp = ccache.get_kegg_compound(cid)
    inchi = comp.inchi

    # joined with a single space, which is what a comma-separated print
    # statement emits between its items
    print(' '.join(["-"*50, '\nC%05d' % cid]))

    mol = Molecule.FromInChI(inchi)
    print(mol.ToSmiles())
    print(mol.FindSmarts("c~[n;+1]~c"))

    try:
        groupvec = inchi2gv_converter.InChI2GroupVector(inchi)
        sys.stdout.write(str(groupvec) + '\n')
    except GroupDecompositionError as e:
        # report the failure and its debug table on stderr, then move on
        sys.stderr.write(str(e) + '\n')
        sys.stderr.write(e.GetDebugTable())