def main(fname, pH, I, T):
    """
    Print a table of compound energies read from a tab-delimited file.

    Each input row must hold, in its first column, a string containing a
    compound ID of the form 'C<digits>' immediately followed by '_10', and in
    its second column a number (dG0). For every row the function prints the
    compound ID, dG0 and dG0 plus the value returned by the compound's
    transform_neutral(pH, I, T), separated by tabs.

    Arguments:
        fname    - path of the tab-delimited input file
        pH, I, T - passed unchanged to transform_neutral()
                   (presumably pH, ionic strength and temperature -- confirm
                   against the Compound class)

    Raises:
        IndexError - if the first column of a row contains no matching ID
        ValueError - if the second column of a row is not a number
    """
    ccache = CompoundCacher()
    # the 'with' block guarantees that the file is closed even when a row
    # fails to parse
    with open(fname, 'r') as fp:
        for row in csv.reader(fp, delimiter='\t'):
            compound_id = re.findall('(C[0-9]+)_10', row[0])[0]
            dG0 = float(row[1])
            comp = ccache.get_compound(compound_id)
            dG0_prime = dG0 + comp.transform_neutral(pH, I, T)
            # a single parenthesised argument prints identically on
            # Python 2 and Python 3
            print('%s\t%f\t%f' % (compound_id, dG0, dG0_prime))
    ccache.dump()
def main(fname, pH, I, T):
    """
    Print a table of compound energies read from a tab-delimited file.

    Each input row must hold, in its first column, a string containing a
    compound ID of the form 'C<digits>' immediately followed by '_10', and in
    its second column a number (dG0). For every row the function prints the
    compound ID, dG0 and dG0 plus the value returned by the compound's
    transform_neutral(pH, I, T), separated by tabs.

    Arguments:
        fname    - path of the tab-delimited input file
        pH, I, T - passed unchanged to transform_neutral()
                   (presumably pH, ionic strength and temperature -- confirm
                   against the Compound class)

    Raises:
        IndexError - if the first column of a row contains no matching ID
        ValueError - if the second column of a row is not a number
    """
    ccache = CompoundCacher()
    # the 'with' block guarantees that the file is closed even when a row
    # fails to parse
    with open(fname, 'r') as fp:
        for row in csv.reader(fp, delimiter='\t'):
            compound_id = re.findall('(C[0-9]+)_10', row[0])[0]
            dG0 = float(row[1])
            comp = ccache.get_compound(compound_id)
            dG0_prime = dG0 + comp.transform_neutral(pH, I, T)
            # a single parenthesised argument prints identically on
            # Python 2 and Python 3
            print('%s\t%f\t%f' % (compound_id, dG0, dG0_prime))
    ccache.dump()
Ejemplo n.º 3
0
def get_ddG0(rxn_dict, pH, I, novel_mets, T=298.15):
    """
    Sum the stoichiometry-weighted transform_pH7() terms of a reaction.

    Arguments:
        rxn_dict   - dictionary mapping compound IDs to their stoichiometric
                     coefficients
        pH, I      - passed unchanged to each compound's transform_pH7()
        novel_mets - None, or a dictionary mapping compound IDs to compound
                     objects; an ID found here is used instead of the one
                     in the compound cache
        T          - temperature passed to transform_pH7(); defaults to
                     298.15, the value that used to be hard-coded
                     (presumably Kelvin -- confirm against the Compound class)

    Returns:
        the sum over all compounds of coeff * comp.transform_pH7(pH, I, T)
        (0 for an empty rxn_dict)
    """
    ccache = CompoundCacher()
    ddG0_forward = 0
    for compound_id, coeff in rxn_dict.items():
        if novel_mets is not None and compound_id in novel_mets:
            comp = novel_mets[compound_id]
        else:
            comp = ccache.get_compound(compound_id)
        ddG0_forward += coeff * comp.transform_pH7(pH, I, T)

    return ddG0_forward
Ejemplo n.º 4
0
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  7 21:00:31 2014

@author: eladn
"""
import sys
from compound_cacher import CompoundCacher

# The compound to refresh is the first command-line argument.
compound_id = sys.argv[1]

CompoundCacher.RebuildCompoundJSON()
ccache = CompoundCacher()

# Drop the cached entry, fetch the compound again and write the cache back,
# reporting each step on stderr.
sys.stderr.write('removing %s from cache ...\n' % compound_id)
ccache.remove(compound_id)

sys.stderr.write('recalculating SMILES and pKa values ...\n')
comp = ccache.get_compound(compound_id)

sys.stderr.write('writing new data to cache ...\n')
ccache.dump()

# Show every field of the compound's JSON dictionary, one per line.
d = comp.to_json_dict()
for (k, v) in d.iteritems():
    sys.stderr.write('%20s : %s\n' % (k, v))
class KeggModel(object):
    """
    A set of reactions stored as a stoichiometric matrix over KEGG compounds.

    Attributes set by the constructor:
        S      - the stoichiometric matrix (one row per compound, one column
                 per reaction)
        cids   - the compound IDs, in the same order as the rows of S
        rids   - the reaction IDs, in the same order as the columns of S
                 (or None)
        ccache - the CompoundCacher used for compound look-ups

    add_thermo() additionally sets 'dG0' and 'cov_dG0'.
    """

    def __del__(self):
        # __init__ can fail (e.g. on one of its assertions) before 'ccache'
        # is assigned, so only flush the cache if it actually exists.
        ccache = getattr(self, 'ccache', None)
        if ccache is not None:
            ccache.dump()

    def __init__(self, S, cids, rids=None):
        """
        Arguments:
           S    - a stoichiometric matrix (compounds x reactions)
           cids - a list of compound IDs, one per row of S. NOTE: if it
                  contains 'C00080' that entry is popped from this very
                  list (the caller's object is modified in place)
           rids - an optional list of reaction IDs, one per column of S
        """
        self.S = S
        self.cids = cids
        self.rids = rids
        assert len(self.cids) == self.S.shape[0]
        if self.rids is not None:
            assert len(self.rids) == self.S.shape[1]
        self.ccache = CompoundCacher()

        # remove H+ from the stoichiometric matrix if it exists
        if 'C00080' in self.cids:
            i = self.cids.index('C00080')
            self.S = np.vstack((self.S[:i, :], self.S[i+1:, :]))
            self.cids.pop(i)

    @staticmethod
    def from_file(fname, arrow='<=>', format='kegg', has_reaction_ids=False):
        """
        reads a file containing reactions in KEGG format

        Arguments:
           fname            - the filename to read
           arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
           format           - the text file format provided ('kegg', 'tsv' or 'csv')
           has_reaction_ids - a boolean flag indicating if there is a column of
                              reaction IDs (separated from the reaction with
                              whitespaces)

        Return a KeggModel

        Raises ValueError if 'format' is not one of the supported values.
        """
        # fail early with a clear message instead of an unbound local later
        if format not in ('kegg', 'tsv', 'csv'):
            raise ValueError("unknown file format '%s' (expected 'kegg', "
                             "'tsv' or 'csv')" % format)

        # the 'with' block closes the file even if parsing raises
        with open(fname, 'r') as fd:
            if format == 'kegg':
                return KeggModel.from_formulas(fd.readlines(), arrow,
                                               has_reaction_ids)
            if format == 'tsv':
                return KeggModel.from_csv(fd,
                                          has_reaction_ids=has_reaction_ids,
                                          delimiter='\t')
            return KeggModel.from_csv(fd, has_reaction_ids=has_reaction_ids,
                                      delimiter=None)

    @staticmethod
    def from_csv(fd, has_reaction_ids=True, delimiter=None):
        """
        reads a stoichiometric matrix from an open delimited text file

        Every data row holds a compound ID followed by one coefficient per
        reaction. If has_reaction_ids is True, the first row is a header
        whose first cell is ignored and whose other cells are the
        reaction IDs.

        Arguments:
           fd               - an iterable of text lines (e.g. an open file)
           has_reaction_ids - whether the first row holds the reaction IDs
           delimiter        - the column delimiter, or None for the csv
                              module's default (a comma)

        Return a KeggModel
        """
        # csv.reader() rejects delimiter=None, so only pass it when given
        reader_kwargs = {} if delimiter is None else {'delimiter': delimiter}
        csv_reader = csv.reader(fd, **reader_kwargs)
        if has_reaction_ids:
            rids = next(csv_reader)[1:]
        else:
            rids = None
        S = []
        cids = []
        for row in csv_reader:
            cids.append(row[0])
            S.append([float(x) for x in row[1:]])
        S = np.array(S)

        return KeggModel(S, cids, rids)

    @staticmethod
    def from_kegg_reactions(kegg_reactions, has_reaction_ids=False):
        """
        builds a model from a list of reaction objects

        Arguments:
           kegg_reactions   - a list of reaction objects providing keys(),
                              dense(cids) and (if has_reaction_ids) 'rid'
           has_reaction_ids - whether to copy the 'rid' of every reaction

        Return a KeggModel whose rows follow the sorted compound IDs
        """
        if has_reaction_ids:
            rids = [r.rid for r in kegg_reactions]
        else:
            rids = None

        cids = set()
        for reaction in kegg_reactions:
            cids.update(reaction.keys())

        # convert the list of reactions in sparse notation into a full
        # stoichiometric matrix, where the rows (compounds) are according to the
        # CID list 'cids'.
        cids = sorted(cids)
        S = np.matrix(np.zeros((len(cids), len(kegg_reactions))))
        for i, reaction in enumerate(kegg_reactions):
            S[:, i] = np.matrix(reaction.dense(cids))

        logging.debug('Successfully loaded %d reactions (involving %d unique compounds)' %
                      (S.shape[1], S.shape[0]))
        return KeggModel(S, cids, rids)

    @staticmethod
    def from_formulas(reaction_strings, arrow='<=>', has_reaction_ids=False,
                      raise_exception=False):
        """
        parses a list of reactions in KEGG format

        Reactions that cannot be parsed or that are not chemically balanced
        are replaced by empty reactions (and a warning is logged), so the
        number of columns always matches the number of input lines.

        Arguments:
           reaction_strings - a list of reactions in KEGG format
           arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
           has_reaction_ids - a boolean flag indicating if there is a column of
                              reaction IDs (separated from the reaction with
                              whitespaces)
           raise_exception  - passed to is_balanced(); also decides whether
                              a ValueError is re-raised or swallowed

        Return a KeggModel, or None if a ValueError occurred and
        raise_exception is False
        """
        try:
            reactions = []
            not_balanced_count = 0
            for line in reaction_strings:
                rid = None
                if has_reaction_ids:
                    tokens = re.findall(r'(\w+)\s+(.*)', line.strip())[0]
                    rid = tokens[0]
                    line = tokens[1]
                try:
                    reaction = KeggReaction.parse_formula(line, arrow, rid)
                except KeggParseException as e:
                    logging.warning(str(e))
                    reaction = KeggReaction({})
                if not reaction.is_balanced(fix_water=True, raise_exception=raise_exception):
                    not_balanced_count += 1
                    logging.warning('Model contains an unbalanced reaction: ' + line)
                    reaction = KeggReaction({})
                reactions.append(reaction)
                logging.debug('Adding reaction: ' + reaction.write_formula())

            if not_balanced_count > 0:
                warning_str = '%d out of the %d reactions are not chemically balanced' % \
                              (not_balanced_count, len(reaction_strings))
                logging.debug(warning_str)
            return KeggModel.from_kegg_reactions(reactions, has_reaction_ids)

        except ValueError as e:
            if raise_exception:
                # a bare 'raise' keeps the original traceback
                raise
            else:
                logging.debug(str(e))
                return None

    def add_thermo(self, cc):
        """
        estimates the dG0 of all model reactions and stores the result in
        self.dG0 and self.cov_dG0

        Arguments:
           cc - an object providing get_dG0_r_multi(reactions)
        """
        # convert every column of S back to a sparse reaction object
        Nc, Nr = self.S.shape
        reactions = []
        for j in range(Nr):
            sparse = {self.cids[i]: self.S[i, j] for i in range(Nc)
                      if self.S[i, j] != 0}
            reactions.append(KeggReaction(sparse))

        self.dG0, self.cov_dG0 = cc.get_dG0_r_multi(reactions)

    def get_transformed_dG0(self, pH, I, T):
        """
            returns the estimated dG0_prime, the standard deviation of
            each estimate (i.e. a measure for the uncertainty), and a
            matrix built from the SVD of the covariance (U * sqrt(s) * V).

            add_thermo() must be called first, since it sets self.dG0
            and self.cov_dG0.
        """
        dG0_prime = self.dG0 + self._get_transform_ddG0(pH=pH, I=I, T=T)
        dG0_std = np.matrix(np.sqrt(np.diag(self.cov_dG0))).T
        U, s, V = np.linalg.svd(self.cov_dG0, full_matrices=True)
        sqrt_Sigma = np.matrix(U) * np.matrix(np.diag(s**0.5)) * np.matrix(V)
        return dG0_prime, dG0_std, sqrt_Sigma

    def _get_transform_ddG0(self, pH, I, T):
        """
        needed in order to calculate the transformed Gibbs energies of the
        model reactions.

        Returns:
            an array (whose length is self.S.shape[1]) with the differences
            between DrG0_prime and DrG0. Therefore, one must add this array
            to the chemical Gibbs energies of reaction (DrG0) to get the
            transformed values
        """
        ddG0_compounds = np.matrix(np.zeros((self.S.shape[0], 1)))
        for i, cid in enumerate(self.cids):
            comp = self.ccache.get_compound(cid)
            ddG0_compounds[i, 0] = comp.transform_pH7(pH, I, T)

        ddG0_forward = np.dot(self.S.T, ddG0_compounds)
        return ddG0_forward

    def check_S_balance(self):
        """
        zeroes the columns of S that do not conserve all elements

        The element matrix is taken from the compound cache; every
        reaction with a non-zero entry in (Ematrix.T * S) is reported in a
        warning and its column in S is set to 0.

        NOTE(review): indexing the result of np.nonzero() with [1] assumes
        that (Ematrix.T * S) is a 2-D np.matrix -- confirm against
        get_element_matrix().

        Return self (to allow chaining)
        """
        elements, Ematrix = self.ccache.get_element_matrix(self.cids)
        conserved = Ematrix.T * self.S
        rxnFil = np.any(conserved[:, range(self.S.shape[1])], axis=0)
        unbalanced_ind = np.nonzero(rxnFil)[1]
        # test the number of indices explicitly; comparing an array to []
        # is ambiguous
        if unbalanced_ind.size > 0:
            logging.warning('There are (%d) unbalanced reactions in S. '
                            'Setting their coefficients to 0.' %
                            len(unbalanced_ind.flat))
            if self.rids is not None:
                logging.warning('These are the unbalanced reactions: ' +
                                ', '.join([self.rids[i] for i in unbalanced_ind.flat]))

            self.S[:, unbalanced_ind] = 0
        return self

    def write_reaction_by_index(self, r):
        """
        returns the formula (as written by KeggReaction.write_formula) of
        the reaction stored in column 'r' of S
        """
        sparse = {cid: self.S[i, r] for i, cid in enumerate(self.cids)
                  if self.S[i, r] != 0}
        if self.rids is not None:
            reaction = KeggReaction(sparse, rid=self.rids[r])
        else:
            reaction = KeggReaction(sparse)
        return reaction.write_formula()

    def get_unidirectional_S(self):
        """
        splits S by sign

        Returns:
            (S_minus, S_plus) - two copies of S; in S_minus all positive
            entries are replaced by 0, and in S_plus all negative entries
            are replaced by 0. S itself is not modified.
        """
        S_plus = np.copy(self.S)
        S_minus = np.copy(self.S)
        S_plus[self.S < 0] = 0
        S_minus[self.S > 0] = 0
        return S_minus, S_plus
        
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  7 21:00:31 2014

@author: eladn
"""
import sys
from compound_cacher import CompoundCacher

# The compound to refresh is the first command-line argument.
compound_id = sys.argv[1]
CompoundCacher.RebuildCompoundJSON()
ccache = CompoundCacher()
# Drop the cached entry, fetch the compound again and write the cache back.
# The order of these calls matters.
# NOTE(review): presumably get_compound() recomputes an entry that is missing
# from the cache -- confirm in compound_cacher.
ccache.remove(compound_id)
comp = ccache.get_compound(compound_id)
ccache.dump()
Ejemplo n.º 7
0
sys.path.append('../python')
import inchi2gv
from compound_cacher import CompoundCacher
from molecule import Molecule

#logger = logging.getLogger('')
#logger.setLevel(logging.DEBUG)
# Set up the compound cache and the group decomposer.
ccache = CompoundCacher('../cache/compounds.json')
groups_data = inchi2gv.init_groups_data()
group_list = groups_data.GetGroupNames()
group_names = groups_data.GetGroupNames()
decomposer = inchi2gv.InChIDecomposer(groups_data)

# Decompose ATP (C00002) into groups and list every group with a non-zero
# entry in its group vector.
ATP_inchi = ccache.get_compound('C00002').inchi
group_def = decomposer.inchi_to_groupvec(ATP_inchi)
for j, group_name in enumerate(group_names):
    if group_def[j] == 0:
        continue
    print('%s  x %d' % (group_name, group_def[j]))

patterns = ['c~[O;+0]', 'c~[O;+1]', 'c~[n;+1]~c', 'c~[n;+0]~c', 'c~[n;-1]~c']

# For two more compounds, print a separator with the ID, the SMILES string
# and the result of a SMARTS search.
for cid in ['C00255', 'C01007']:
    comp = ccache.get_compound(cid)
    print('%s \n%s' % ("-" * 50, cid))
    inchi = comp.inchi
    mol = Molecule.FromInChI(inchi)
    print(mol.ToSmiles())

    print(mol.FindSmarts("c~[n;+1]~c"))
Ejemplo n.º 8
0
# Check that NumPy is importable; warn (without counting an error) when its
# version is older than 1.6.2, and count an error when it is missing.
try:
    import numpy
    if StrictVersion('1.6.2') > StrictVersion(numpy.__version__):
        sys.stderr.write(
            'WARNING: your NumPy version is lower than 1.6.2 and might not '
            'work properly. Please upgrade to a newer version.\n')
except ImportError:
    sys.stderr.write(
        'NumPy is not installed. Please go to http://www.numpy.org and '
        'follow the installation instructions.\n')
    err_num += 1

# Check that the compound cache can be used and that the SMILES string it
# stores for ATP (C00002) equals the expected one.
try:
    from compound_cacher import CompoundCacher
    ccache = CompoundCacher()
    atp_comp = ccache.get_compound('C00002')
    # compare explicitly: an 'assert' is stripped when Python runs with -O,
    # which would silently disable this check
    if smiles_ATP_pH7 != atp_comp.smiles_pH7:
        sys.stderr.write('Internal Error: the SMILES string for ATP is wrong.\n')
        err_num += 1
except Exception as e:
    # terminate the message with a newline, like all the other messages
    sys.stderr.write('Error using Compound Cacher: ' + str(e) + '\n')
    err_num += 1

# Test inchi2gv.py
try:
    import inchi2gv    
    groups_data = inchi2gv.init_groups_data()
    decomposer = inchi2gv.InChIDecomposer(groups_data)
    groupvec1 = decomposer.inchi_to_groupvec(inchi_ATP)
    groupvec2 = decomposer.smiles_to_groupvec(smiles_ATP_pH7)
class ComponentContribution(object):
    """
    Estimates reaction and formation energies by combining a regression in
    the space of compounds ('reactant contributions', RC) with a regression
    in the space of chemical groups ('group contributions', GC).

    All results of the training are stored in the dictionary self.params,
    which is filled by train() (called lazily by the methods that need it)
    or loaded from a .mat file by from_matfile().
    """

    def __init__(self, training_data=None):
        """
        Arguments:
            training_data - an object providing 'cids', 'S', 'dG0' and
                            'weight'; a new TrainingData() is created if
                            it is None
        """
        if training_data is None:
            training_data = TrainingData()

        self.train_cids = list(training_data.cids)
        self.cids_joined = list(training_data.cids)

        self.train_S = training_data.S
        self.model_S_joined = np.matrix(self.train_S)
        self.train_S_joined = self.model_S_joined

        self.train_b = np.matrix(training_data.dG0).T
        self.train_w = np.matrix(training_data.weight).T
        self.train_G = None
        self.params = None

        self.ccache = CompoundCacher()
        self.groups_data = inchi2gv.init_groups_data()
        self.decomposer = inchi2gv.InChIDecomposer(self.groups_data)
        self.group_names = self.groups_data.GetGroupNames()

        self.Nc = len(self.cids_joined)
        self.Ng = len(self.group_names)

    @staticmethod
    def init():
        """
            Returns a ComponentContribution object, loaded from the file
            CC_CACHE_FNAME if it exists. Otherwise, a new object is created
            and its parameters are saved to that file (which triggers the
            training).
        """
        if os.path.exists(CC_CACHE_FNAME):
            logging.debug('Loading component-contributions from cache')
            return ComponentContribution.from_matfile(CC_CACHE_FNAME)
        else:
            logging.debug('Calculating the component-contributions from raw data')
            cc = ComponentContribution()
            cc.save_matfile(CC_CACHE_FNAME)
            return cc

    def save_matfile(self, file_name):
        """
            Writes self.params to 'file_name' using savemat(), training
            first if that has not been done yet.
        """
        if self.params is None:
            self.train()

        savemat(file_name, self.params, oned_as='row')

    @staticmethod
    def from_matfile(file_name, training_data=None):
        """
            Creates a ComponentContribution object whose 'params' are read
            from 'file_name' using loadmat() instead of being trained.
        """
        cc = ComponentContribution(training_data=training_data)
        cc.params = loadmat(file_name)
        return cc

    def get_major_ms_dG0_f(self, compound_id):
        """
            Returns the chemical formation energy of the major MS at pH 7.
            If the compound is part of the training set, returns the value
            that was calculated during training. Otherwise, we use pure
            group contribution (if possible) on the groups of the major MS.

            Returns NaN if the compound cannot be decomposed into groups.
            Raises ValueError if compound_id is None.
        """
        if compound_id is None:
            raise ValueError('given compound ID is None')
        if self.params is None:
            self.train()

        if compound_id in self.cids_joined:
            i = self.cids_joined.index(compound_id)
            return self.params['dG0_cc'][i, 0]
        else:
            # Decompose the compound and calculate the 'formation energy'
            # using the group contributions.
            # Note that the length of the group contribution vector we get
            # from CC is longer than the number of groups in "groups_data"
            # since we artificially added fictive groups to represent all the
            # non-decomposable compounds. Therefore, we truncate the
            # dG0_gc vector since here we only use GC for compounds which
            # are not in cids_joined anyway.
            comp = self.ccache.get_compound(compound_id)
            try:
                group_vec = self.decomposer.smiles_to_groupvec(comp.smiles_pH7)
                g = np.matrix(group_vec.ToArray())
                dG0_gc = self.params['dG0_gc'][0:self.Ng, :]
                return float(np.dot(g, dG0_gc))
            except inchi2gv.GroupDecompositionError:
                return np.nan

    def _decompose_reaction(self, reaction):
        """
            Splits a reaction into a stoichiometric vector over the known
            compounds and a group vector for all the other compounds.

            Arguments:
                reaction - an object whose iteritems() yields pairs of
                           (compound ID, stoichiometric coefficient)

            Returns:
                (x, g) - x is a column vector (Nc x 1) with the coefficients
                of the compounds that are in cids_joined; g is a column
                vector (with one row per column of params['G']) holding the
                coefficient-weighted sum of the group vectors of all other
                compounds, padded with zeros.

            Raises inchi2gv.GroupDecompositionError (from the decomposer)
            if one of the other compounds cannot be decomposed.
        """
        if self.params is None:
            self.train()

        cids = list(self.params['cids'])
        G = self.params['G']

        # calculate the reaction stoichiometric vector and the group incidence
        # vector (x and g)
        x = np.matrix(np.zeros((self.Nc, 1)))
        x_prime = []
        G_prime = []

        for compound_id, coeff in reaction.iteritems():
            if compound_id in self.cids_joined:
                i = cids.index(compound_id)
                x[i, 0] = coeff
            else:
                # Decompose the compound into groups; its contribution to
                # the reaction energy is then estimated using the group
                # contributions alone.
                x_prime.append(coeff)
                comp = self.ccache.get_compound(compound_id)
                group_vec = self.decomposer.smiles_to_groupvec(comp.smiles_pH7)
                G_prime.append(group_vec.ToArray())

        if x_prime:
            g = np.matrix(x_prime) * np.vstack(G_prime)
        else:
            g = np.matrix(np.zeros((1, 1)))

        # The group vector we get from the decomposer is shorter than the
        # number of columns of G, since fictive groups were added to
        # represent all the non-decomposable compounds. resize() pads it
        # with zeros.
        g.resize((G.shape[1], 1))

        return x, g

    def get_dG0_r(self, reaction, include_analysis=False):
        """
            Arguments:
                reaction         - a KeggReaction object
                include_analysis - if True, also return the contribution of
                                   each training observation

            Returns:
                the CC estimation for this reaction's untransformed dG0 (i.e.
                using the major MS at pH 7 for each of the reactants) and
                the square root of its estimated variance. If
                include_analysis is True, a third value is returned: a list
                of dictionaries (with the keys 'index', 'w_rc', 'w_gc',
                'reaction' and 'count'), sorted by decreasing absolute
                weight.

                If one of the reactants cannot be decomposed into groups,
                the returned estimate is 0 with an uncertainty of 1e5 (and
                an empty analysis).
        """
        try:
            x, g = self._decompose_reaction(reaction)
        except inchi2gv.GroupDecompositionError:
            if not include_analysis:
                return 0, 1e5
            else:
                return 0, 1e5, []

        v_r = np.matrix(self.params['preprocess_v_r'])
        v_g = np.matrix(self.params['preprocess_v_g'])
        C1  = np.matrix(self.params['preprocess_C1'])
        C2  = np.matrix(self.params['preprocess_C2'])
        C3  = np.matrix(self.params['preprocess_C3'])

        dG0_cc = float(x.T * v_r + g.T * v_g)
        s_cc_sqr = float(x.T * C1 * x + 2 * x.T * C2 * g + g.T * C3 * g)

        if not include_analysis:
            return dG0_cc, np.sqrt(s_cc_sqr)
        else:
            # Analyse the contribution of each training observation to this
            # reaction's dG0 estimate.
            G1 = np.matrix(self.params['preprocess_G1'])
            G2 = np.matrix(self.params['preprocess_G2'])
            G3 = np.matrix(self.params['preprocess_G3'])
            S  = np.matrix(self.params['preprocess_S'])
            S_count = np.matrix(self.params['preprocess_S_count'])
            cids = self.params['cids']

            # dG0_cc = (x*G1 + x*G2 + g*G3)*b
            weights_rc = (x.T * G1).round(5)
            weights_gc = (x.T * G2 + g.T * G3).round(5)
            weights = weights_rc + weights_gc

            orders = sorted(range(weights.shape[1]),
                            key=lambda j: abs(weights[0, j]), reverse=True)

            analysis = []
            for j in orders:
                # skip the observations with a negligible weight
                if abs(weights[0, j]) < 1e-5:
                    continue
                r = KeggReaction({cids[i]: S[i, j] for i in range(S.shape[0])
                                  if S[i, j] != 0})
                analysis.append({'index': j,
                                 'w_rc': weights_rc[0, j],
                                 'w_gc': weights_gc[0, j],
                                 'reaction': r,
                                 'count': int(S_count[0, j])})

            return dG0_cc, np.sqrt(s_cc_sqr), analysis

    def get_dG0_r_multi(self, reactions):
        """
            Arguments:
                reactions - a list of KeggReaction objects

            Returns:
                (dG0_cc, U) - dG0_cc is a column vector with the CC
                estimations for the untransformed dG0 of each of the
                reactions, and U is the matrix
                X'*C1*X + X'*C2*G + G'*C2'*X + G'*C3*G, i.e. the
                multi-reaction form of the variance computed in get_dG0_r().

                A reaction with a reactant that cannot be decomposed into
                groups is replaced by an all-zero reaction.
        """
        X = []
        G = []
        for reaction in reactions:
            try:
                x, g = self._decompose_reaction(reaction)
            except inchi2gv.GroupDecompositionError:
                x = np.zeros((self.Nc, 1))
                g = np.zeros((self.params['G'].shape[1], 1))
            X.append(list(x.flat))
            G.append(list(g.flat))
        X = np.matrix(X).T
        G = np.matrix(G).T

        v_r = np.matrix(self.params['preprocess_v_r'])
        v_g = np.matrix(self.params['preprocess_v_g'])
        C1  = np.matrix(self.params['preprocess_C1'])
        C2  = np.matrix(self.params['preprocess_C2'])
        C3  = np.matrix(self.params['preprocess_C3'])

        dG0_cc = X.T * v_r + G.T * v_g
        U = X.T * C1 * X + X.T * C2 * G + G.T * C2.T * X + G.T * C3 * G
        return dG0_cc, U

    def get_compound_json(self, compound_id):
        """
            adds the component-contribution estimation to the JSON

            Returns a dictionary with the key 'CID' and 'num_electrons'
            and, whenever they can be determined, 'compound_index',
            'group_vector' (a list of (index, value) pairs of the non-zero
            entries), 'pmap', 'InChI', 'mass' and 'formula'. If the
            formation energy cannot be estimated, the reason is stored
            under 'error'.

            Raises ValueError if compound_id is None.
        """
        if compound_id is None:
            raise ValueError('given compound ID is None')
        if self.params is None:
            self.train()

        d = {'CID': compound_id}
        comp = self.ccache.get_compound(compound_id)
        gv = None

        if compound_id in self.cids_joined:
            i = self.cids_joined.index(compound_id)
            gv = self.params['G'][i, :]
            major_ms_dG0_f = self.params['dG0_cc'][i, 0]
            d['compound_index'] = i
        elif comp.smiles_pH7 is not None:
            # decompose the compound and estimate its formation energy
            # using the group contributions
            try:
                group_def = self.decomposer.smiles_to_groupvec(comp.smiles_pH7)
                gv = np.matrix(group_def.ToArray())
                # we need to truncate the dG0_gc matrix from all the group
                # dimensions that correspond to non-decomposable compounds
                # from the training set
                dG0_gc = self.params['dG0_gc'][0:self.Ng, :]
                major_ms_dG0_f = float(np.dot(gv, dG0_gc))
            except inchi2gv.GroupDecompositionError:
                d['error'] = 'We cannot estimate the formation energy of this compound ' +\
                             'because its structure is too small or too complex to ' +\
                             'decompose to groups'
                major_ms_dG0_f = np.nan
        else:
            d['error'] = 'We cannot estimate the formation energy of this compound ' +\
                         'because it has no defined structure'
            major_ms_dG0_f = np.nan

        if gv is not None:
            # build a real list (not a lazy filter object), so that the
            # result is the same on Python 2 and 3
            sparse_gv = [(j, v) for j, v in enumerate(gv.flat) if v != 0]
            d['group_vector'] = sparse_gv

        if not np.isnan(major_ms_dG0_f):
            d['pmap'] = {'source': 'Component Contribution (2013)',
                         'species': list(comp.get_species(major_ms_dG0_f, default_T))}

        d['num_electrons'] = comp.atom_bag.get('e-', 0)

        if comp.inchi is not None:
            d['InChI'] = comp.inchi
            try:
                mol = Molecule.FromInChI(str(comp.inchi))
                d['mass'] = mol.GetExactMass()
                d['formula'] = mol.GetFormula()
            except OpenBabelError:
                if compound_id == 'C00282': # an exception for hydrogen
                    d['mass'] = 2.0157
                    d['formula'] = 'H2'
                else:
                    d['mass'] = 0
                    d['formula'] = ''

        return d

    def estimate_kegg_model(self, model_S, model_cids):
        """
            Trains CC on the union of the training compounds and the
            compounds of a model, and estimates the dG0 of the model's
            reactions.

            Arguments:
                model_S    - the stoichiometric matrix of the model
                model_cids - the compound IDs of the rows of model_S

            Returns:
                (model_dG0, model_cov_dG0, MSE_kerG)
        """
        # standardize the CID list of the training data and the model
        # and create new (larger) matrices for each one. Compare against
        # everything that was already joined (not only the training set),
        # so that calling this method more than once never adds the same
        # compound twice.
        known_cids = set(self.cids_joined)
        cids_new = []
        for cid in model_cids:
            if cid not in known_cids:
                known_cids.add(cid)
                cids_new.append(cid)

        self.cids_joined += cids_new
        self.Nc = len(self.cids_joined)

        self.model_S_joined = ComponentContribution._zero_pad_S(
            model_S, model_cids, self.cids_joined)

        self.train_S_joined = ComponentContribution._zero_pad_S(
            self.train_S, self.train_cids, self.cids_joined)

        self.train()

        dG0_cc = self.params['dG0_cc']
        cov_dG0 = self.params['cov_dG0']
        MSE_kerG = self.params['MSE_kerG']

        model_dG0 = self.model_S_joined.T * dG0_cc
        model_cov_dG0 = self.model_S_joined.T * cov_dG0 * self.model_S_joined

        return model_dG0, model_cov_dG0, MSE_kerG

    def create_group_incidence_matrix(self):
        """
            Initialize G matrix, and then use the python script "inchi2gv.py" to
            decompose each of the compounds (using its SMILES at pH 7) and
            save the decomposition as a row in the G matrix.

            Every compound that cannot be decomposed gets a column of its
            own (appended to the right of the group columns), with a single
            1 in the row of that compound.
        """

        G = np.zeros((self.Nc, self.Ng))
        cpd_inds_without_gv = []

        # decompose the compounds in cids_joined and add to G
        for i, compound_id in enumerate(self.cids_joined):
            smiles_pH7 = self.ccache.get_compound(compound_id).smiles_pH7
            try:
                group_def = self.decomposer.smiles_to_groupvec(smiles_pH7)
                for j in range(len(self.group_names)):
                    G[i, j] = group_def[j]
            except inchi2gv.GroupDecompositionError:
                # for compounds that have no InChI or are not decomposable
                # add a unique 1 in a new column
                cpd_inds_without_gv.append(i)

        N_non_decomposable = len(cpd_inds_without_gv)
        add_G = np.zeros((self.Nc, N_non_decomposable))
        for j, i in enumerate(cpd_inds_without_gv):
            add_G[i, j] = 1
        return np.matrix(np.hstack([G, add_G]))

    def train(self):
        """
            Estimate standard Gibbs energies of formation

            Builds the group incidence matrix, performs the two weighted
            regressions (RC and GC) and stores all the results (including
            the matrices needed later for estimating the uncertainty) in
            self.params.
        """
        self.train_G = self.create_group_incidence_matrix()

        S = self.train_S_joined
        G = self.train_G
        b = self.train_b
        w = self.train_w

        m, n = S.shape
        assert G.shape[0] == m
        assert b.shape == (n, 1)
        assert w.shape == (n, 1)

        # Apply weighing
        W = np.diag(w.flat)
        GS = G.T * S

        # Linear regression for the reactant layer (aka RC)
        inv_S, r_rc, P_R_rc, P_N_rc = ComponentContribution._invert_project(S * W)

        # Linear regression for the group layer (aka GC)
        inv_GS, r_gc, P_R_gc, P_N_gc = ComponentContribution._invert_project(GS * W)

        # calculate the group contributions
        dG0_gc = inv_GS.T * W * b

        # Calculate the contributions in the stoichiometric space
        dG0_rc = inv_S.T * W * b
        dG0_cc = P_R_rc * dG0_rc + P_N_rc * G * dG0_gc

        # Calculate the residual error (weighted squared error divided by N - rank)
        e_rc = (S.T * dG0_rc - b)
        MSE_rc = float((e_rc.T * W * e_rc) / (n - r_rc))
        # MSE_rc = (e_rc.T * e_rc) / (n - r_rc)

        e_gc = (GS.T * dG0_gc - b)
        MSE_gc = float((e_gc.T * W * e_gc) / (n - r_gc))
        # MSE_gc = (e_gc.T * e_gc) / (n - r_gc)

        # Calculate the MSE of GC residuals for all reactions in ker(G).
        # This will help later to give an estimate of the uncertainty for such
        # reactions, which otherwise would have a 0 uncertainty in the GC method.
        kerG_inds = list(np.where(np.all(GS == 0, 0))[1].flat)

        e_kerG = e_gc[kerG_inds]
        MSE_kerG = float((e_kerG.T * e_kerG) / len(kerG_inds))

        MSE_inf = 1e10

        # Calculate the uncertainty covariance matrices
        # [inv_S_orig, ~, ~, ~] = invertProjection(S);
        # [inv_GS_orig, ~, ~, ~] = invertProjection(GS);
        inv_SWS, _, _, _ = ComponentContribution._invert_project(S * W * S.T)
        inv_GSWGS, _, _, _ = ComponentContribution._invert_project(GS * W * GS.T)


        #V_rc  = P_R_rc * (inv_S_orig.T * W * inv_S_orig) * P_R_rc
        #V_gc  = P_N_rc * G * (inv_GS_orig.T * W * inv_GS_orig) * G' * P_N_rc
        V_rc = P_R_rc * inv_SWS * P_R_rc
        V_gc  = P_N_rc * G * inv_GSWGS * G.T * P_N_rc
        # V_rc  = P_R_rc * (inv_S_orig.T * inv_S_orig) * P_R_rc
        # V_gc  = P_N_rc * G * (inv_GS_orig.T * inv_GS_orig) * G.T * P_N_rc
        V_inf = P_N_rc * G * P_N_gc * G.T * P_N_rc

        # Calculate the total of the contributions and covariances
        cov_dG0 = V_rc * MSE_rc + V_gc * MSE_gc + V_inf * MSE_inf

        # preprocessing matrices (for calculating the contribution of each
        # observation)
        G1 = P_R_rc * inv_S.T * W
        G2 = P_N_rc * G * inv_GS.T * W
        G3 = inv_GS.T * W

        S_uniq, P_col = ComponentContribution._col_uniq(S)
        S_counter = np.sum(P_col, 0)
        preprocess_G1 = G1 * P_col
        preprocess_G2 = G2 * P_col
        preprocess_G3 = G3 * P_col

        # preprocessing matrices (for quick calculation of uncertainty)
        preprocess_C1 = cov_dG0
        preprocess_C2 = MSE_gc * P_N_rc * G * inv_GSWGS + MSE_inf * G * P_N_gc
        preprocess_C3 = MSE_gc * inv_GSWGS + MSE_inf * P_N_gc

        # Put all the calculated data in 'params' for the sake of debugging
        self.params = {'b':              self.train_b,
                       'train_S':        self.train_S_joined,
                       'model_S':        self.model_S_joined,
                       'train_cids':     self.train_cids,
                       'cids':           self.cids_joined,
                       'w':              self.train_w,
                       'G':              self.train_G,
                       'dG0_rc':         dG0_rc,
                       'dG0_gc':         dG0_gc,
                       'dG0_cc':         dG0_cc,
                       'cov_dG0':        cov_dG0,
                       'V_rc':           V_rc,
                       'V_gc':           V_gc,
                       'V_inf':          V_inf,
                       'MSE_rc':         MSE_rc,
                       'MSE_gc':         MSE_gc,
                       'MSE_kerG':       MSE_kerG,
                       'MSE_inf':        MSE_inf,
                       'P_R_rc':         P_R_rc,
                       'P_R_gc':         P_R_gc,
                       'P_N_rc':         P_N_rc,
                       'P_N_gc':         P_N_gc,
                       'inv_S':          inv_S,
                       'inv_GS':         inv_GS,
                       'inv_SWS':        inv_SWS,
                       'inv_GSWGS':      inv_GSWGS,
                       'preprocess_v_r': dG0_cc,
                       'preprocess_v_g': dG0_gc,
                       'G1':             G1,
                       'G2':             G2,
                       'G3':             G3,
                       'preprocess_G1':  preprocess_G1,
                       'preprocess_G2':  preprocess_G2,
                       'preprocess_G3':  preprocess_G3,
                       'preprocess_S':   S_uniq,
                       'preprocess_S_count': S_counter,
                       'preprocess_C1':  preprocess_C1,
                       'preprocess_C2':  preprocess_C2,
                       'preprocess_C3':  preprocess_C3}

    @staticmethod
    def _zero_pad_S(S, cids_orig, cids_joined):
        """
            takes a stoichiometric matrix with a given list of IDs 'cids_orig'
            and adds 0-rows so that the list of IDs will be 'cids_joined'

            Returns the padded matrix as a np.matrix. Raises an Exception if
            one of the IDs in 'cids_orig' is missing from 'cids_joined'.
        """
        # map every ID to the index of its first occurrence (the same row
        # that list.index() would find), so that each look-up is O(1)
        row_of = {}
        for k, cid in enumerate(cids_joined):
            row_of.setdefault(cid, k)

        if not set(cids_orig).issubset(row_of):
            raise Exception('The full list is missing some IDs in "cids"')

        full_S = np.zeros((len(cids_joined), S.shape[1]))
        for i, cid in enumerate(cids_orig):
            S_row = S[i, :]
            full_S[row_of[cid], :] = S_row

        return np.matrix(full_S)

    @staticmethod
    def _invert_project(A, eps=1e-10):
        """
            Calculates an inverse of A together with two projection matrices
            built from its singular value decomposition.

            NOTE(review): LINALG.svd() is defined elsewhere; the code below
            looks like it expects U, S and V to be matrices (with S holding
            the singular values) -- confirm against that helper.

            Returns:
                (inv_A, r, P_R, P_N) - r is the number of values in S that
                are larger than 'eps'; P_R and P_N are U1*U1' and U2*U2',
                where U1 holds the first r columns of U and U2 the rest.
        """
        n, m = A.shape
        U, S, V = LINALG.svd(A)
        inv_A = V * np.linalg.pinv(S) * U.T

        r = (S > eps).sum()
        P_R   = U[:, :r] * U[:, :r].T
        P_N   = U[:, r:] * U[:, r:].T

        return inv_A, r, P_R, P_N

    @staticmethod
    def _row_uniq(A):
        """
            A procedure usually performed before linear regression (i.e. solving Ax = y).
            If the matrix A contains repeating rows, it is advisable to combine
            all of them to one row, and the observed value corresponding to that
            row will be the average of the original observations.

            Input:
                A - a 2D NumPy array

            Returns:
                A_unique, P_row

                where A_unique has the same number of columns as A, but with
                unique rows (sorted in descending order).
                P_row is a matrix that can be used to map the original rows
                to the ones in A_unique (all values in P_row are 0 or 1);
                it has one row per unique row and one column per original
                row.
        """
        # convert the rows of A into tuples so we can compare them
        A_tuples = [tuple(A[i, :].flat) for i in range(A.shape[0])]
        A_unique = sorted(set(A_tuples), reverse=True)

        # index of every unique row (a dictionary look-up instead of a
        # linear search for each original row)
        unique_index = {tup: i for i, tup in enumerate(A_unique)}

        # create the projection matrix that maps the rows in A to rows in
        # A_unique
        P_row = np.matrix(np.zeros((len(A_unique), len(A_tuples))))
        for j, tup in enumerate(A_tuples):
            P_row[unique_index[tup], j] = 1

        return np.matrix(A_unique), P_row

    @staticmethod
    def _col_uniq(A):
        """
            The same as _row_uniq(), but for the columns of A.

            Returns:
                A_unique, P_col

                where A_unique has unique columns, and P_col has one row
                per original column and one column per unique column.
        """
        A_unique, P_col = ComponentContribution._row_uniq(A.T)
        return A_unique.T, P_col.T
class TrainingData(object):
    """
        Collects the thermodynamic observations used for training.

        On construction the three data files listed in FNAME_DICT are read
        and merged into one stoichiometric matrix (self.S, compounds x
        reactions) together with per-reaction vectors (dG0_prime, T, I, pH,
        pMg, weight) and lists (reference, description). Reactions flagged
        for balancing are then balanced with water (the ones that remain
        unbalanced are dropped) and finally the reverse transform is
        applied, which stores the result in self.dG0.
    """

    # a dictionary of the filenames of the training data and the relative
    # weight of each one
    FNAME_DICT = {'TECRDB' : ('../data/TECRDB.tsv', 1.0),
                  'FORMATION' : ('../data/formation_energies_transformed.tsv', 1.0),
                  'REDOX' : ('../data/redox.tsv', 1.0)}

    def __del__(self):
        # __init__ may have failed before the cache was attached, in which
        # case there is nothing to write back
        ccache = getattr(self, 'ccache', None)
        if ccache is not None:
            ccache.dump()

    def __init__(self):
        self.ccache = CompoundCacher()

        thermo_params, self.cids_that_dont_decompose = \
            TrainingData.get_all_thermo_params()

        cids = set()
        for d in thermo_params:
            cids.update(d['reaction'].keys())
        cids = sorted(cids)

        # row index of every compound, so that filling the matrix does not
        # need a linear search of 'cids' for each coefficient
        row_of_cid = dict((cid, i) for i, cid in enumerate(cids))

        # convert the list of reactions in sparse notation into a full
        # stoichiometric matrix, where the rows (compounds) are according to the
        # CID list 'cids'.
        self.S = np.zeros((len(cids), len(thermo_params)))
        for k, d in enumerate(thermo_params):
            for cid, coeff in d['reaction'].iteritems():
                self.S[row_of_cid[cid], k] = coeff

        self.cids = cids

        self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
        self.T = np.array([d['T'] for d in thermo_params])
        self.I = np.array([d['I'] for d in thermo_params])
        self.pH = np.array([d['pH'] for d in thermo_params])
        self.pMg = np.array([d['pMg'] for d in thermo_params])
        self.weight = np.array([d['weight'] for d in thermo_params])
        self.reference = [d['reference'] for d in thermo_params]
        self.description = [d['description'] for d in thermo_params]
        rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                               if d['balance']]

        self.balance_reactions(rxn_inds_to_balance)

        self.reverse_transform()

    def savemat(self, fname):
        """
            Write the per-reaction vectors and the compound ID list to
            'fname' using the module-level savemat function.
        """
        d = {'dG0_prime': self.dG0_prime,
             'dG0': self.dG0,
             'T': self.T,
             'I': self.I,
             'pH': self.pH,
             'pMg': self.pMg,
             'weight': self.weight,
             'cids': self.cids}
        savemat(fname, d, oned_as='row')

    def savecsv(self, fname):
        """
            Write one CSV row per reaction (formula, T, I, pH, reference,
            dG0 and dG0_prime) to 'fname'. The file is closed before
            returning, also when writing fails.
        """
        n_cpds, n_rxns = self.S.shape
        with open(fname, 'w') as fp:
            csv_output = csv.writer(fp)
            csv_output.writerow(['reaction', 'T', 'I', 'pH', 'reference', 'dG0', 'dG0_prime'])
            for j in range(n_rxns):
                sparse = {self.cids[i]: self.S[i, j] for i in range(n_cpds)}
                r_string = KeggReaction(sparse).write_formula()
                csv_output.writerow([r_string, self.T[j], self.I[j], self.pH[j],
                                     self.reference[j], self.dG0[j], self.dG0_prime[j]])

    @staticmethod
    def str2double(s):
        """
            casts a string to float, but if the string is empty return NaN
        """
        if s == '':
            return np.nan
        else:
            return float(s)

    @staticmethod
    def read_tecrdb(fname, weight):
        """
            Read the raw data of TECRDB (NIST)

            Arguments:
                fname  - path of a tab-delimited file without a header row
                         (the columns are listed in 'headers' below)
                weight - the weight stored with every reaction that is read

            Returns:
                a list of dictionaries with the keys: reaction, dG'0, T, I,
                pH, pMg, weight, balance, reference and description.
                Rows without a value for K', T or pH are skipped.

            Raises:
                ValueError if a numeric field of a row cannot be parsed.
        """
        thermo_params = [] # columns are: reaction, dG'0, T, I, pH, pMg, weight, balance?

        headers = ["URL", "REF_ID", "METHOD", "EVAL", "EC", "ENZYME NAME",
                   "REACTION IN KEGG IDS", "REACTION IN COMPOUND NAMES",
                   "K", "K'", "T", "I", "pH", "pMg"]

        with open(fname, 'r') as fp:
            for row_list in csv.reader(fp, delimiter='\t'):
                if row_list == []:
                    continue
                row = dict(zip(headers, row_list))
                if (row['K\''] == '') or (row['T'] == '') or (row['pH'] == ''):
                    continue

                # parse the reaction
                reaction = KeggReaction.parse_formula(row['REACTION IN KEGG IDS'], arrow='=')

                # all numeric conversions happen inside the 'try', so that
                # any malformed number is reported together with its row
                try:
                    T = TrainingData.str2double(row['T'])

                    # calculate dG'0
                    dG0_prime = -R * T * \
                                np.log(TrainingData.str2double(row['K\'']))
                    params = {'reaction': reaction,
                              'dG\'0' : dG0_prime,
                              'T': T,
                              'I': TrainingData.str2double(row['I']),
                              'pH': TrainingData.str2double(row['pH']),
                              'pMg': TrainingData.str2double(row['pMg']),
                              'weight': weight,
                              'balance': True,
                              'reference': row['REF_ID'],
                              'description': row['REACTION IN COMPOUND NAMES']}
                except ValueError:
                    raise ValueError('Cannot parse row: ' + str(row))
                thermo_params.append(params)

        logging.debug('Successfully added %d reactions from TECRDB' % len(thermo_params))
        return thermo_params

    @staticmethod
    def read_formations(fname, weight):
        """
            Read the Formation Energy data

            Arguments:
                fname  - path of a tab-delimited file with a header row
                weight - the weight stored with every formation energy

            Returns:
                thermo_params, cids_that_dont_decompose

                thermo_params is a list of dictionaries (same keys as in
                read_tecrdb) with one entry per row that has a dG'0 value.
                cids_that_dont_decompose is the set of compound IDs whose
                'decompose' column equals 0.
        """
        # columns are: reaction, dG'0, T, I, pH, pMg, weight, balance?
        thermo_params = []
        cids_that_dont_decompose = set()

        # fields are: cid, name, dG'0, pH, I, pMg, T, decompose?,
        #             compound_ref, remark
        with open(fname, 'r') as fp:
            for row in csv.DictReader(fp, delimiter='\t'):
                if int(row['decompose']) == 0:
                    cids_that_dont_decompose.add(row['cid'])
                if row['dG\'0'] != '':
                    rxn = KeggReaction({row['cid'] : 1})
                    thermo_params.append({'reaction': rxn,
                                          'dG\'0' : TrainingData.str2double(row['dG\'0']),
                                          'T': TrainingData.str2double(row['T']),
                                          'I': TrainingData.str2double(row['I']),
                                          'pH': TrainingData.str2double(row['pH']),
                                          'pMg': TrainingData.str2double(row['pMg']),
                                          'weight': weight,
                                          'balance': False,
                                          'reference': row['compound_ref'],
                                          'description': row['name'] + ' formation'})

        logging.debug('Successfully added %d formation energies' % len(thermo_params))
        return thermo_params, cids_that_dont_decompose

    @staticmethod
    def read_redox(fname, weight):
        """
            Read the Reduction potential data

            Arguments:
                fname  - path of a tab-delimited file with a header row
                weight - the weight stored with every redox reaction

            Returns:
                a list of dictionaries (same keys as in read_tecrdb), one
                per row, where the reaction converts CID_ox to CID_red and
                dG'0 is -F * E'0 * delta_e.
        """
        # columns are: reaction, dG'0, T, I, pH, pMg, weight, balance?
        thermo_params = []

        # fields are: name, CID_ox, nH_ox, charge_ox, CID_red,
        #             nH_red, charge_red, E'0, pH, I, pMg, T, ref
        with open(fname, 'r') as fp:
            for row in csv.DictReader(fp, delimiter='\t'):
                delta_nH = TrainingData.str2double(row['nH_red']) - \
                           TrainingData.str2double(row['nH_ox'])
                delta_charge = TrainingData.str2double(row['charge_red']) - \
                               TrainingData.str2double(row['charge_ox'])
                # difference in protons minus the difference in charge
                delta_e = delta_nH - delta_charge
                dG0_prime = -F * TrainingData.str2double(row['E\'0']) * delta_e
                rxn = KeggReaction({row['CID_ox'] : -1, row['CID_red'] : 1})
                thermo_params.append({'reaction': rxn,
                                      'dG\'0' : dG0_prime,
                                      'T': TrainingData.str2double(row['T']),
                                      'I': TrainingData.str2double(row['I']),
                                      'pH': TrainingData.str2double(row['pH']),
                                      'pMg': TrainingData.str2double(row['pMg']),
                                      'weight': weight,
                                      'balance': False,
                                      'reference': row['ref'],
                                      'description': row['name'] + ' redox'})

        logging.debug('Successfully added %d redox potentials' % len(thermo_params))
        return thermo_params

    @staticmethod
    def get_all_thermo_params():
        """
            Read the three data sets listed in FNAME_DICT (the paths are
            resolved relative to the directory of this source file).

            Returns:
                thermo_params, cids_that_dont_decompose

                thermo_params is the concatenation of the TECRDB, formation
                and redox entries (in this order).
        """
        base_path = os.path.split(os.path.realpath(__file__))[0]

        fname, weight = TrainingData.FNAME_DICT['TECRDB']
        fname = os.path.join(base_path, fname)
        tecrdb_params = TrainingData.read_tecrdb(fname, weight)

        fname, weight = TrainingData.FNAME_DICT['FORMATION']
        fname = os.path.join(base_path, fname)
        formation_params, cids_that_dont_decompose = TrainingData.read_formations(fname, weight)

        fname, weight = TrainingData.FNAME_DICT['REDOX']
        fname = os.path.join(base_path, fname)
        redox_params = TrainingData.read_redox(fname, weight)

        thermo_params = tecrdb_params + formation_params + redox_params
        return thermo_params, cids_that_dont_decompose

    def balance_reactions(self, rxn_inds_to_balance):
        """
            use the chemical formulas from the InChIs to verify that each and every
            reaction is balanced

            Arguments:
                rxn_inds_to_balance - column indices (in self.S) of the
                                      reactions that should be checked

            Reactions involving a compound without a formula (NaN in the
            element matrix) are not checked. Oxygen imbalance is fixed by
            changing the coefficient of C00001; reactions that are still
            unbalanced afterwards are removed from self.S and from all the
            per-reaction vectors and lists.
        """
        elements, Ematrix = self.ccache.get_element_matrix(self.cids)
        cpd_inds_without_formula = list(np.nonzero(np.any(np.isnan(Ematrix), 1))[0].flat)
        Ematrix[np.isnan(Ematrix)] = 0

        S_without_formula = self.S[cpd_inds_without_formula, :]
        rxn_inds_without_formula = np.nonzero(np.any(S_without_formula != 0, 0))[0]
        rxn_inds_to_balance = set(rxn_inds_to_balance).difference(rxn_inds_without_formula)

        # need to check that all elements are balanced (except H, but including e-)
        # if only O is not balanced, add water molecules
        if 'O' in elements:
            i_H2O = self.cids.index('C00001')
            j_O = elements.index('O')
            conserved = np.dot(Ematrix.T, self.S)
            for k in rxn_inds_to_balance:
                self.S[i_H2O, k] = self.S[i_H2O, k] - conserved[j_O, k]

        # recalculate conservation matrix (np.dot is used in both places so
        # that the product does not depend on Ematrix being an np.matrix)
        conserved = np.dot(Ematrix.T, self.S)

        rxn_inds_to_remove = [k for k in rxn_inds_to_balance
                              if np.any(conserved[:, k] != 0, 0)]

        for k in rxn_inds_to_remove:
            sprs = {}
            for i in np.nonzero(self.S[:, k])[0]:
                sprs[self.cids[i]] = self.S[i, k]
            reaction = KeggReaction(sprs)
            logging.debug('unbalanced reaction #%d: %s' %
                          (k, reaction.write_formula()))
            for j in np.where(conserved[:, k])[0].flat:
                logging.debug('there are %d more %s atoms on the right-hand side' %
                              (conserved[j, k], elements[j]))

        rxn_inds_to_keep = \
            set(range(self.S.shape[1])).difference(rxn_inds_to_remove)

        rxn_inds_to_keep = sorted(rxn_inds_to_keep)

        self.S = self.S[:, rxn_inds_to_keep]
        self.dG0_prime = self.dG0_prime[rxn_inds_to_keep]
        self.T = self.T[rxn_inds_to_keep]
        self.I = self.I[rxn_inds_to_keep]
        self.pH = self.pH[rxn_inds_to_keep]
        self.pMg = self.pMg[rxn_inds_to_keep]
        self.weight = self.weight[rxn_inds_to_keep]
        self.reference = [self.reference[i] for i in rxn_inds_to_keep]
        self.description = [self.description[i] for i in rxn_inds_to_keep]

        logging.debug('After removing %d unbalanced reactions, the stoichiometric '
                      'matrix contains: '
                      '%d compounds and %d reactions' %
                      (len(rxn_inds_to_remove), self.S.shape[0], self.S.shape[1]))

    def reverse_transform(self):
        """
            Calculate the reverse transform for all reactions in training_data.

            Missing (NaN) ionic strengths and pMg values are replaced in
            place by their defaults. The result is stored in self.dG0.
        """
        n_rxns = self.S.shape[1]
        reverse_ddG0 = np.zeros(n_rxns)
        self.I[np.isnan(self.I)] = 0.25 # default ionic strength is 0.25M
        self.pMg[np.isnan(self.pMg)] = 14 # default pMg is 14
        for i in range(n_rxns):
            for j in np.nonzero(self.S[:, i])[0]:
                cid = self.cids[j]
                if cid == 'C00080': # H+ should be ignored in the Legendre transform
                    continue
                comp = self.ccache.get_compound(cid)
                ddG0 = comp.transform_pH7(self.pH[i], self.I[i], self.T[i])
                reverse_ddG0[i] = reverse_ddG0[i] + ddG0 * self.S[j, i]

        self.dG0 = self.dG0_prime - reverse_ddG0
class KeggReaction(object):
    """
        A single reaction in sparse notation: a dictionary that maps
        compound IDs to stoichiometric coefficients, where negative
        coefficients are written on the left-hand side of the formula and
        positive ones on the right-hand side.
    """

    def __init__(self, sparse, arrow='<=>', rid=None):
        """
            Arguments:
                sparse - a dictionary from compound ID to coefficient
                         (entries with a zero coefficient are dropped)
                arrow  - the string written between the two sides
                rid    - an optional reaction ID

            Raises:
                ValueError if a coefficient is neither an int nor a float
        """
        for cid, coeff in sparse.items():
            if not isinstance(coeff, (float, int)):
                raise ValueError('All values in KeggReaction must be integers or floats')
        self.sparse = {cid: coeff for cid, coeff in sparse.items() if coeff}
        self.arrow = arrow
        self.rid = rid
        self.ccache = CompoundCacher()

    def keys(self):
        """Return the compound IDs that take part in the reaction."""
        return self.sparse.keys()

    def iteritems(self):
        """Return an iterator over (compound ID, coefficient) pairs."""
        return iter(self.sparse.items())

    def __str__(self):
        return self.write_formula()

    def reverse(self):
        """
            reverse the direction of the reaction by negating all stoichiometric
            coefficients
        """
        self.sparse = dict((k, -v) for (k, v) in self.sparse.items())

    @staticmethod
    def parse_reaction_formula_side(s):
        """
            Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'

            Returns:
                A dictionary from compound ID to its amount on this side
                (amounts of a compound that appears more than once are
                summed up). The string 'null' yields an empty dictionary.

            Raises:
                KeggParseException if a member consists of more than one
                token and the first token is not a number.
        """
        if s.strip() == "null":
            return {}

        compound_bag = {}
        for member in re.split(r'\s+\+\s+', s):
            tokens = member.split(None, 1)
            if len(tokens) == 0:
                continue
            if len(tokens) == 1:
                amount = 1
                # use the token (and not 'member') so that whitespace
                # around the side does not become part of the compound ID
                key = tokens[0]
            else:
                try:
                    amount = float(tokens[0])
                except ValueError:
                    raise KeggParseException(
                        "Non-specific reaction: %s" % s)
                key = tokens[1].strip()

            compound_bag[key] = compound_bag.get(key, 0) + amount

        return compound_bag

    @staticmethod
    def parse_formula(formula, arrow='<=>', rid=None):
        """
            Parses a two-sided formula such as: 2 C00001 => C00002 + C00003

            Return:
                A KeggReaction where the compounds on the left of the arrow
                have negative coefficients and the ones on the right have
                positive coefficients.

            Raises:
                KeggParseException if the formula does not contain exactly
                one arrow.
        """
        tokens = formula.split(arrow)
        if len(tokens) < 2:
            raise KeggParseException('Reaction does not contain the arrow sign (%s): %s'
                                     % (arrow, formula))
        if len(tokens) > 2:
            raise KeggParseException('Reaction contains more than one arrow sign (%s): %s'
                                     % (arrow, formula))

        left = tokens[0].strip()
        right = tokens[1].strip()

        sparse_reaction = {}
        for cid, count in KeggReaction.parse_reaction_formula_side(left).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count

        for cid, count in KeggReaction.parse_reaction_formula_side(right).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count

        return KeggReaction(sparse_reaction, arrow, rid=rid)

    @staticmethod
    def write_compound_and_coeff(compound_id, coeff):
        """Format one member of a formula; a coefficient of 1 is omitted."""
        if coeff == 1:
            return compound_id
        else:
            return "%g %s" % (coeff, compound_id)

    def write_formula(self):
        """String representation."""
        left = []
        right = []
        for cid, coeff in sorted(self.sparse.items()):
            if coeff < 0:
                left.append(KeggReaction.write_compound_and_coeff(cid, -coeff))
            elif coeff > 0:
                right.append(KeggReaction.write_compound_and_coeff(cid, coeff))
        return "%s %s %s" % (' + '.join(left), self.arrow, ' + '.join(right))

    def _get_reaction_atom_bag(self, raise_exception=False):
        """
            Use for checking if all elements are conserved.

            Returns:
                An atom_bag of the differences between the sides of the reaction.
                E.g. if there is one extra C on the left-hand side, the result will
                be {'C': -1}.
                If a compound is missing from the cache, or a formula is
                not specific, the result is None (or a ValueError is
                raised, when raise_exception is True).
        """
        try:
            cids = list(self.keys())
            coeffs = np.matrix([self.sparse[cid] for cid in cids])

            cached_cids = set(map(str, self.ccache.compound_id2inchi.keys()))
            if not cached_cids.issuperset(cids):
                missing_cids = set(cids).difference(cached_cids)
                warning_str = 'The following compound IDs are not in the cache, ' + \
                              'make sure they appear in kegg_additions.tsv and ' + \
                              'then run compound_cacher.py: ' + \
                              ', '.join(sorted(missing_cids))
                raise ValueError(warning_str)

            elements, Ematrix = self.ccache.get_element_matrix(cids)
            conserved = coeffs * Ematrix

            if np.any(np.isnan(conserved), 1):
                warning_str = 'cannot test reaction balancing because of unspecific ' + \
                              'compound formulas: %s' % self.write_formula()
                raise ValueError(warning_str)

            atom_bag = {}
            if np.any(conserved != 0, 1):
                logging.debug('unbalanced reaction: %s' % self.write_formula())
                for j, c in enumerate(conserved.flat):
                    if c != 0:
                        logging.debug('there are %d more %s atoms on the right-hand side' %
                                      (c, elements[j]))
                        atom_bag[str(elements[j])] = c
            return atom_bag

        except ValueError as e:
            if raise_exception:
                # a bare 'raise' keeps the original traceback
                raise
            else:
                logging.debug(str(e))
                return None

    def is_balanced(self, fix_water=False, raise_exception=False):
        """
            Check that all elements are conserved by the reaction.

            Arguments:
                fix_water       - if True, an oxygen imbalance is fixed by
                                  changing the coefficient of C00001 (this
                                  modifies the reaction in place)
                raise_exception - passed on to _get_reaction_atom_bag

            Returns:
                True if the atom bag of the reaction is empty. False if it
                is not, or if the atom bag cannot be calculated.
        """
        reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)

        if reaction_atom_bag is None: # this means some compound formulas are missing
            return False

        if fix_water and 'O' in reaction_atom_bag:
            self.sparse.setdefault('C00001', 0)
            self.sparse['C00001'] += -reaction_atom_bag['O']
            if self.sparse['C00001'] == 0:
                del self.sparse['C00001']
            reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)

            # the second calculation can fail as well (e.g. when C00001
            # itself is not in the cache)
            if reaction_atom_bag is None:
                return False

        return len(reaction_atom_bag) == 0

    def is_empty(self):
        """Return True if the reaction has no compounds."""
        return len(self.sparse) == 0

    def dense(self, cids):
        """
            Return the reaction as a column matrix with one row per entry
            of 'cids' (which must contain all compounds of the reaction).
        """
        s = np.matrix(np.zeros((len(cids), 1)))
        for cid, coeff in self.iteritems():
            s[cids.index(cid), 0] = coeff
        return s

    def get_transform_ddG0(self, pH, I, T):
        """
        needed in order to calculate the transformed Gibbs energies of
        reactions.

        Returns:
            The difference between DrG0_prime and DrG0 for this reaction.
            Therefore, this value must be added to the chemical Gibbs
            energy of reaction (DrG0) to get the transformed value.
        """
        ddG0_forward = 0
        for compound_id, coeff in self.iteritems():
            comp = self.ccache.get_compound(compound_id)
            ddG0_forward += coeff * comp.transform_pH7(pH, I, T)
        return ddG0_forward
class KeggModel(object):
    """
        A set of reactions given as a stoichiometric matrix S (compounds x
        reactions), the list of compound IDs matching its rows and an
        optional list of reaction IDs matching its columns.
    """

    def __del__(self):
        # __init__ can fail (e.g. on one of its assertions) before the
        # cache is attached, in which case there is nothing to write back
        ccache = getattr(self, 'ccache', None)
        if ccache is not None:
            ccache.dump()

    def __init__(self, S, cids, rids=None):
        """
            Arguments:
                S    - the stoichiometric matrix (compounds x reactions)
                cids - the compound IDs of the rows of S
                rids - optional reaction IDs of the columns of S

            The row of C00080 (H+) is removed if it exists.
        """
        self.S = S
        # work on a copy, since the list is modified below (and in
        # check_S_balance) and the caller's list should stay intact
        self.cids = list(cids)
        self.rids = rids
        assert len(self.cids) == self.S.shape[0]
        if self.rids is not None:
            assert len(self.rids) == self.S.shape[1]
        self.ccache = CompoundCacher()

        # remove H+ from the stoichiometric matrix if it exists
        if 'C00080' in self.cids:
            i = self.cids.index('C00080')
            self.S = np.vstack((self.S[:i, :], self.S[i + 1:, :]))
            self.cids.pop(i)

    @staticmethod
    def from_file(fname, arrow='<=>', format='kegg', has_reaction_ids=False):
        """
        reads a file containing reactions in KEGG format
        
        Arguments:
           fname            - the filename to read
           arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
           format           - the text file format provided ('kegg', 'tsv' or 'csv')
           has_reaction_ids - a boolean flag indicating if there is a column of
                              reaction IDs (separated from the reaction with
                              whitespaces)
        
        Return a KeggModel

        Raises a ValueError if 'format' is not one of the supported values.
        """
        if format not in ('kegg', 'tsv', 'csv'):
            raise ValueError("Unknown file format: %s (must be 'kegg', "
                             "'tsv' or 'csv')" % format)

        with open(fname, 'r') as fd:
            if format == 'kegg':
                return KeggModel.from_formulas(fd.readlines(), arrow,
                                               has_reaction_ids)
            if format == 'tsv':
                return KeggModel.from_csv(fd,
                                          has_reaction_ids=has_reaction_ids,
                                          delimiter='\t')
            return KeggModel.from_csv(fd,
                                      has_reaction_ids=has_reaction_ids,
                                      delimiter=None)

    @staticmethod
    def from_csv(fd, has_reaction_ids=True, delimiter=None):
        """
            Read a stoichiometric matrix from an open file, where every row
            starts with a compound ID followed by its coefficients.

            Arguments:
               fd               - an open file (or any iterable of lines)
               has_reaction_ids - if True, the first row holds the reaction
                                  IDs (its first cell is ignored)
               delimiter        - the column separator; None selects the
                                  default of the csv module (a comma)

            Return a KeggModel
        """
        # csv.reader does not accept None as a delimiter, so the argument
        # is only passed on when it was actually provided
        if delimiter is None:
            csv_reader = csv.reader(fd)
        else:
            csv_reader = csv.reader(fd, delimiter=delimiter)

        if has_reaction_ids:
            rids = next(csv_reader)
            rids = rids[1:]
        else:
            rids = None
        S = []
        cids = []
        for row in csv_reader:
            cids.append(row[0])
            S.append([float(x) for x in row[1:]])
        S = np.array(S)

        return KeggModel(S, cids, rids)

    @staticmethod
    def from_kegg_reactions(kegg_reactions, has_reaction_ids=False):
        """
            Build a KeggModel from a list of KeggReaction objects.

            Arguments:
               kegg_reactions   - a list of reactions in sparse notation
               has_reaction_ids - if True, the 'rid' attributes of the
                                  reactions are used as reaction IDs
        """
        if has_reaction_ids:
            rids = [r.rid for r in kegg_reactions]
        else:
            rids = None

        cids = set()
        for reaction in kegg_reactions:
            cids = cids.union(reaction.keys())

        # convert the list of reactions in sparse notation into a full
        # stoichiometric matrix, where the rows (compounds) are according to the
        # CID list 'cids'.
        cids = sorted(cids)
        S = np.matrix(np.zeros((len(cids), len(kegg_reactions))))
        for i, reaction in enumerate(kegg_reactions):
            S[:, i] = np.matrix(reaction.dense(cids))

        logging.debug(
            'Successfully loaded %d reactions (involving %d unique compounds)'
            % (S.shape[1], S.shape[0]))
        return KeggModel(S, cids, rids)

    @staticmethod
    def from_formulas(reaction_strings,
                      arrow='<=>',
                      has_reaction_ids=False,
                      raise_exception=False):
        """
        parses a list of reactions in KEGG format
        
        Arguments:
           reaction_strings - a list of reactions in KEGG format
           arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
           has_reaction_ids - a boolean flag indicating if there is a column of
                              reaction IDs (separated from the reaction with
                              whitespaces)
           raise_exception  - if True, a ValueError raised while checking
                              the reactions is passed on to the caller
        
        Return value:
           a KeggModel, in which reactions that cannot be parsed or are not
           balanced are replaced by empty reactions. If a ValueError is
           raised and raise_exception is False, None is returned.
        """
        try:
            reactions = []
            not_balanced_count = 0
            for line in reaction_strings:
                rid = None
                if has_reaction_ids:
                    tokens = re.findall(r'(\w+)\s+(.*)', line.strip())[0]
                    rid = tokens[0]
                    line = tokens[1]
                try:
                    reaction = KeggReaction.parse_formula(line, arrow, rid)
                except KeggParseException as e:
                    logging.warning(str(e))
                    reaction = KeggReaction({})
                if not reaction.is_balanced(fix_water=True,
                                            raise_exception=raise_exception):
                    not_balanced_count += 1
                    logging.warning('Model contains an unbalanced reaction: ' +
                                    line)
                    reaction = KeggReaction({})
                reactions.append(reaction)
                logging.debug('Adding reaction: ' + reaction.write_formula())

            if not_balanced_count > 0:
                warning_str = '%d out of the %d reactions are not chemically balanced' % \
                              (not_balanced_count, len(reaction_strings))
                logging.debug(warning_str)
            return KeggModel.from_kegg_reactions(reactions, has_reaction_ids)

        except ValueError as e:
            if raise_exception:
                # a bare 'raise' keeps the original traceback
                raise
            else:
                logging.debug(str(e))
                return None

    def add_thermo(self, cc):
        """
            Store in self.dG0 and self.cov_dG0 the values returned by
            cc.get_dG0_r_multi for the reactions of the model.
        """
        # check that all CIDs in the reaction are already cached by CC
        Nc, Nr = self.S.shape
        reactions = []
        for j in range(Nr):
            sparse = {
                self.cids[i]: self.S[i, j]
                for i in range(Nc) if self.S[i, j] != 0
            }
            reaction = KeggReaction(sparse)
            reactions.append(reaction)

        self.dG0, self.cov_dG0 = cc.get_dG0_r_multi(reactions)

    def get_transformed_dG0(self, pH, I, T):
        """
            returns the estimated dG0_prime and the standard deviation of
            each estimate (i.e. a measure for the uncertainty).

            The third returned value is U * diag(sqrt(s)) * V, where
            U, s, V are the factors of the SVD of self.cov_dG0.
            add_thermo must be called first.
        """
        dG0_prime = self.dG0 + self._get_transform_ddG0(pH=pH, I=I, T=T)
        dG0_std = np.matrix(np.sqrt(np.diag(self.cov_dG0))).T
        U, s, V = np.linalg.svd(self.cov_dG0, full_matrices=True)
        sqrt_Sigma = np.matrix(U) * np.matrix(np.diag(s**0.5)) * np.matrix(V)
        return dG0_prime, dG0_std, sqrt_Sigma

    def _get_transform_ddG0(self, pH, I, T):
        """
        needed in order to calculate the transformed Gibbs energies of the 
        model reactions.
        
        Returns:
            an array (whose length is self.S.shape[1]) with the differences
            between DrG0_prime and DrG0. Therefore, one must add this array
            to the chemical Gibbs energies of reaction (DrG0) to get the 
            transformed values
        """
        ddG0_compounds = np.matrix(np.zeros((self.S.shape[0], 1)))
        for i, cid in enumerate(self.cids):
            comp = self.ccache.get_compound(cid)
            ddG0_compounds[i, 0] = comp.transform_pH7(pH, I, T)

        ddG0_forward = np.dot(self.S.T, ddG0_compounds)
        return ddG0_forward

    def check_S_balance(self, fix_water=False):
        """
            Check that every reaction (column of S) conserves all elements
            and set the columns of unbalanced reactions to 0.

            Arguments:
               fix_water - if True, an oxygen imbalance is first fixed by
                           changing the coefficient of C00001 (a row for it
                           is appended to S if needed)

            Returns the model itself.

            NOTE(review): the products below assume that
            ccache.get_element_matrix returns an np.matrix - confirm.
        """
        if fix_water and 'C00001' not in self.cids:
            # the new row is added before the element matrix is fetched, so
            # that 'elements', 'Ematrix' and 'conserved' are consistent
            self.S = np.vstack([self.S, np.zeros((1, self.S.shape[1]))])
            self.cids.append('C00001')

        elements, Ematrix = self.ccache.get_element_matrix(self.cids)
        conserved = Ematrix.T * self.S

        if fix_water:
            # This part only looks for imbalanced oxygen and uses extra
            # H2O molecules (on either side of the reaction equation) to
            # balance them. Keep in mind that also the e- balance is affected
            # by the water (and hydrogen is not counted at all).
            i_h2o = self.cids.index('C00001')
            add_water = -conserved[elements.index('O'), :]
            # a flat copy can be added to a row of S regardless of S being
            # an array or a matrix
            water_row = np.asarray(add_water).ravel()
            self.S[i_h2o, :] = self.S[i_h2o, :] + water_row
            conserved += Ematrix[i_h2o, :].T * add_water

        # column indices of the reactions where some element is not conserved
        is_unbalanced = np.asarray(np.any(conserved, axis=0)).ravel()
        unbalanced_ind = np.nonzero(is_unbalanced)[0]
        if unbalanced_ind.size > 0:
            logging.warning('There are (%d) unbalanced reactions in S. '
                            'Setting their coefficients to 0.' %
                            len(unbalanced_ind))
            if self.rids is not None:
                logging.warning(
                    'These are the unbalanced reactions: ' +
                    ', '.join([self.rids[i] for i in unbalanced_ind]))

            self.S[:, unbalanced_ind] = 0
        return self

    def write_reaction_by_index(self, r):
        """Return the formula string of the reaction in column r of S."""
        sparse = dict([(cid, self.S[i, r]) for i, cid in enumerate(self.cids)
                       if self.S[i, r] != 0])
        if self.rids is not None:
            reaction = KeggReaction(sparse, rid=self.rids[r])
        else:
            reaction = KeggReaction(sparse)
        return reaction.write_formula()

    def get_unidirectional_S(self):
        """
            Split S into its negative and its positive entries.

            Returns:
                S_minus, S_plus

                copies of S where the positive (respectively the negative)
                coefficients are replaced by 0.
        """
        S_plus = np.copy(self.S)
        S_minus = np.copy(self.S)
        S_plus[self.S < 0] = 0
        S_minus[self.S > 0] = 0
        return S_minus, S_plus
# Demo script: decompose one compound into groups and print the SMILES
# strings of two other compounds.

# extend the module search path before importing the project modules below
sys.path.append("../python")
import inchi2gv
from compound_cacher import CompoundCacher
from molecule import Molecule

# logger = logging.getLogger('')
# logger.setLevel(logging.DEBUG)
ccache = CompoundCacher("../cache/compounds.json")
groups_data = inchi2gv.init_groups_data()
# NOTE: group_list holds the same value as group_names and is not used in
# the rest of this script
group_list = groups_data.GetGroupNames()
group_names = groups_data.GetGroupNames()
decomposer = inchi2gv.InChIDecomposer(groups_data)

# test the decomposition of ATP into groups
ATP_inchi = ccache.get_compound("C00002").inchi
group_def = decomposer.inchi_to_groupvec(ATP_inchi)
# print only the groups that have a non-zero entry in the group vector
for j, group_name in enumerate(group_names):
    if group_def[j] != 0:
        print group_name, " x %d" % group_def[j]


# NOTE: 'patterns' is not referenced in the visible part of this script
patterns = ["c~[O;+0]", "c~[O;+1]", "c~[n;+1]~c", "c~[n;+0]~c", "c~[n;-1]~c"]

# for each of these compounds, print a separator line with its ID and then
# the SMILES string generated from its InChI
for cid in ["C00255", "C01007"]:
    comp = ccache.get_compound(cid)
    print "-" * 50, "\n%s" % cid
    inchi = comp.inchi
    mol = Molecule.FromInChI(inchi)
    print mol.ToSmiles()
class KeggReaction(object):
    """
        A chemical reaction stored as a sparse dictionary that maps compound
        IDs to stoichiometric coefficients. Compounds on the left-hand side
        of the arrow have negative coefficients, compounds on the right-hand
        side have positive ones.
    """

    def __init__(self, sparse, arrow='<=>', rid=None):
        """
            Arguments:
                sparse - a dictionary mapping compound IDs to int or float
                         coefficients; entries with a zero coefficient are
                         dropped
                arrow  - the arrow string used when writing the formula
                rid    - an optional reaction ID

            Raises:
                ValueError if one of the coefficients is not an int or float
        """
        for _cid, coeff in sparse.items():
            if not isinstance(coeff, (float, int)):
                raise ValueError(
                    'All values in KeggReaction must be integers or floats')
        # keep only the non-zero coefficients (written without a
        # tuple-parameter lambda, which is a syntax error on Python 3)
        self.sparse = dict((k, v) for (k, v) in sparse.items() if v)
        self.arrow = arrow
        self.rid = rid
        self.ccache = CompoundCacher()

    def keys(self):
        """Returns the compound IDs that take part in the reaction."""
        return self.sparse.keys()

    def iteritems(self):
        """Returns an iterator over (compound ID, coefficient) pairs."""
        return iter(self.sparse.items())

    def __str__(self):
        return self.write_formula()

    def reverse(self):
        """
            reverse the direction of the reaction by negating all stoichiometric
            coefficients
        """
        self.sparse = dict((k, -v) for (k, v) in self.sparse.items())

    @staticmethod
    def parse_reaction_formula_side(s):
        """
            Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'

            Returns:
                A dictionary mapping each compound ID to its total amount on
                this side. A compound written without a coefficient counts
                as 1, explicit coefficients are converted to float, and
                repeated compounds are summed. The side 'null' (and an empty
                string) gives an empty dictionary.

            Raises:
                KeggParseException if a coefficient is not a number
        """
        if s.strip() == "null":
            return {}

        compound_bag = {}
        for member in re.split(r'\s+\+\s+', s):
            tokens = member.split(None, 1)
            if len(tokens) == 0:
                continue
            if len(tokens) == 1:
                amount = 1
                # use the token rather than the raw member, so that
                # whitespace around the side does not end up in the key
                key = tokens[0]
            else:
                try:
                    amount = float(tokens[0])
                except ValueError:
                    raise KeggParseException("Non-specific reaction: %s" % s)
                # split(None, 1) leaves trailing whitespace on the remainder
                key = tokens[1].strip()

            compound_bag[key] = compound_bag.get(key, 0) + amount

        return compound_bag

    @staticmethod
    def parse_formula(formula, arrow='<=>', rid=None):
        """
            Parses a two-sided formula such as: 2 C00001 => C00002 + C00003

            Arguments:
                formula - the formula string
                arrow   - the arrow string separating the two sides
                rid     - an optional reaction ID for the new object

            Return:
                A KeggReaction in which the compounds left of the arrow have
                negative coefficients and the ones right of it positive.

            Raises:
                KeggParseException if the arrow does not appear exactly once
                or if one of the sides cannot be parsed
        """
        tokens = formula.split(arrow)
        if len(tokens) < 2:
            raise KeggParseException(
                'Reaction does not contain the arrow sign (%s): %s' %
                (arrow, formula))
        if len(tokens) > 2:
            raise KeggParseException(
                'Reaction contains more than one arrow sign (%s): %s' %
                (arrow, formula))

        left = tokens[0].strip()
        right = tokens[1].strip()

        sparse_reaction = {}
        for cid, count in KeggReaction.parse_reaction_formula_side(
                left).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count

        for cid, count in KeggReaction.parse_reaction_formula_side(
                right).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count

        return KeggReaction(sparse_reaction, arrow, rid=rid)

    @staticmethod
    def write_compound_and_coeff(compound_id, coeff):
        """
            Returns the compound ID alone when the coefficient is 1, and
            '<coeff> <compound ID>' otherwise.
        """
        if coeff == 1:
            return compound_id
        else:
            return "%g %s" % (coeff, compound_id)

    def write_formula(self):
        """String representation."""
        left = []
        right = []
        # sorted by compound ID, so that the output is deterministic
        for cid, coeff in sorted(self.sparse.items()):
            if coeff < 0:
                left.append(KeggReaction.write_compound_and_coeff(cid, -coeff))
            elif coeff > 0:
                right.append(KeggReaction.write_compound_and_coeff(cid, coeff))
        return "%s %s %s" % (' + '.join(left), self.arrow, ' + '.join(right))

    def _get_reaction_atom_bag(self, raise_exception=False):
        """
            Use for checking if all elements are conserved.

            Arguments:
                raise_exception - if True, a ValueError raised while building
                                  the atom bag is re-raised; otherwise it is
                                  logged at debug level and None is returned

            Returns:
                An atom_bag of the differences between the sides of the reaction.
                E.g. if there is one extra C on the left-hand side, the result will
                be {'C': -1}. An empty dictionary means nothing is unbalanced.
                None is returned if the bag could not be computed (a compound
                is missing from the cache or the element matrix has NaNs).
        """
        try:
            cids = list(self.keys())
            # build an explicit list: on Python 3 map() returns an iterator,
            # which np.matrix cannot turn into a row vector
            coeffs = np.matrix([self.sparse[cid] for cid in cids])

            cached_cids = set(map(str, self.ccache.compound_id2inchi.keys()))
            if not cached_cids.issuperset(cids):
                missing_cids = set(cids).difference(cached_cids)
                warning_str = 'The following compound IDs are not in the cache, ' + \
                              'make sure they appear in kegg_additions.tsv and ' + \
                              'then run compound_cacher.py: ' + \
                              ', '.join(sorted(missing_cids))
                raise ValueError(warning_str)

            elements, Ematrix = self.ccache.get_element_matrix(cids)
            conserved = coeffs * Ematrix

            if np.any(np.isnan(conserved), axis=1):
                warning_str = 'cannot test reaction balancing because of unspecific ' + \
                              'compound formulas: %s' % self.write_formula()
                raise ValueError(warning_str)

            atom_bag = {}
            if np.any(conserved != 0, axis=1):
                logging.debug('unbalanced reaction: %s' % self.write_formula())
                for j, c in enumerate(conserved.flat):
                    if c != 0:
                        # %g rather than %d: coefficients may be fractional
                        logging.debug(
                            'there are %g more %s atoms on the right-hand side'
                            % (c, elements[j]))
                        atom_bag[str(elements[j])] = c
            return atom_bag

        except ValueError as e:
            if raise_exception:
                # bare raise keeps the original traceback
                raise
            else:
                logging.debug(str(e))
                return None

    def is_balanced(self, fix_water=False, raise_exception=False):
        """
            Checks whether all elements are conserved by the reaction.

            Arguments:
                fix_water       - if True and oxygen is unbalanced, the
                                  coefficient of C00001 is changed IN PLACE
                                  to cancel the oxygen difference, and the
                                  balance is then checked again
                raise_exception - passed on to _get_reaction_atom_bag

            Returns:
                True if no element is unbalanced, False otherwise (also when
                the atom bag cannot be computed).
        """
        reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)

        if reaction_atom_bag is None:  # this means some compound formulas are missing
            return False

        if fix_water and 'O' in reaction_atom_bag:
            self.sparse.setdefault('C00001', 0)
            self.sparse['C00001'] += -reaction_atom_bag['O']
            if self.sparse['C00001'] == 0:
                del self.sparse['C00001']
            reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)
            # the re-check can fail as well (e.g. C00001 is not in the
            # cache); without this guard len(None) raises a TypeError
            if reaction_atom_bag is None:
                return False

        return len(reaction_atom_bag) == 0

    def is_empty(self):
        """Returns True if the reaction has no compounds."""
        return len(self.sparse) == 0

    def dense(self, cids):
        """
            Arguments:
                cids - a list of compound IDs defining the row order

            Returns:
                A (len(cids) x 1) matrix holding the coefficient of each
                compound at the row given by its position in cids; all other
                rows are 0.

            Raises:
                ValueError (from list.index) if one of the reaction's
                compounds is not in cids
        """
        s = np.matrix(np.zeros((len(cids), 1)))
        for cid, coeff in self.sparse.items():
            s[cids.index(cid), 0] = coeff
        return s

    def get_transform_ddG0(self, pH, I, T):
        """
        needed in order to calculate the transformed Gibbs energies of
        reactions.

        Arguments:
            pH, I, T - passed unchanged to transform_pH7 of every compound

        Returns:
            The difference between DrG0_prime and DrG0 for this reaction.
            Therefore, this value must be added to the chemical Gibbs
            energy of reaction (DrG0) to get the transformed value.
        """
        ddG0_forward = 0
        for compound_id, coeff in self.sparse.items():
            comp = self.ccache.get_compound(compound_id)
            ddG0_forward += coeff * comp.transform_pH7(pH, I, T)
        return ddG0_forward