def write_reaction_by_index(self, r):
    """
    Return the formula string of the reaction stored in column r of self.S.

    Only compounds with a non-zero coefficient in that column are included.
    If self.rids is not None, the r-th reaction ID is attached to the
    reaction before it is written.
    """
    coefficients = {}
    for row, cid in enumerate(self.cids):
        coeff = self.S[row, r]
        if coeff != 0:
            coefficients[cid] = coeff

    if self.rids is None:
        rxn = KeggReaction(coefficients)
    else:
        rxn = KeggReaction(coefficients, rid=self.rids[r])
    return rxn.write_formula()
 def write_reaction_by_index(self, r):
     """
     Write the r-th reaction of the model as a formula string.

     The reaction is assembled from the non-zero entries of column r of
     self.S, keyed by the matching entries of self.cids. The reaction ID
     self.rids[r] is passed along only when self.rids is not None.
     """
     column = ((cid, self.S[i, r]) for i, cid in enumerate(self.cids))
     sparse = {cid: coeff for cid, coeff in column if coeff != 0}
     extra = {} if self.rids is None else {'rid': self.rids[r]}
     return KeggReaction(sparse, **extra).write_formula()
    def balance_reactions(self, rxn_inds_to_balance):
        """
            Balance the selected reactions with water and remove the ones
            that remain unbalanced.

            The element matrix of self.cids is taken from self.ccache.
            Reactions that involve a compound whose element row contains NaN
            are excluded from the check (and are never removed). If 'O' is one
            of the elements, the coefficient of 'C00001' is adjusted in every
            checked reaction so that oxygen is conserved. Checked reactions
            that still show a non-zero imbalance in any element are logged
            and removed from self.S and from the per-reaction attributes.

            Arguments:
               rxn_inds_to_balance - iterable of column indices of self.S
                                     that should be checked
        """
        elements, Ematrix = self.ccache.get_element_matrix(self.cids)
        cpd_inds_without_formula = list(np.nonzero(np.any(np.isnan(Ematrix), 1))[0].flat)
        Ematrix[np.isnan(Ematrix)] = 0

        S_without_formula = self.S[cpd_inds_without_formula, :]
        # flatten before calling nonzero so that the result holds reaction
        # (column) indices for ndarray and np.matrix inputs alike
        has_unknown_cpd = np.asarray(np.any(S_without_formula != 0, 0)).ravel()
        rxn_inds_without_formula = np.nonzero(has_unknown_cpd)[0]
        rxn_inds_to_balance = set(rxn_inds_to_balance).difference(rxn_inds_without_formula)

        # adjust the water coefficient of every checked reaction so that
        # oxygen is conserved; the other elements are verified afterwards
        if 'O' in elements:
            i_H2O = self.cids.index('C00001')
            j_O = elements.index('O')
            conserved = np.dot(Ematrix.T, self.S)
            for k in rxn_inds_to_balance:
                self.S[i_H2O, k] = self.S[i_H2O, k] - conserved[j_O, k]

        # recalculate conservation matrix; np.dot is a matrix product for
        # ndarray and np.matrix alike, whereas '*' is elementwise on ndarrays
        conserved = np.dot(Ematrix.T, self.S)

        # sorted() only makes the order of the log messages deterministic
        rxn_inds_to_remove = [k for k in sorted(rxn_inds_to_balance)
                              if np.any(conserved[:, k] != 0, 0)]

        for k in rxn_inds_to_remove:
            sprs = {}
            for i in np.nonzero(self.S[:, k])[0]:
                sprs[self.cids[i]] = self.S[i, k]
            reaction = KeggReaction(sprs)
            logging.debug('unbalanced reaction #%d: %s',
                          k, reaction.write_formula())
            for j in np.where(conserved[:, k])[0].flat:
                logging.debug('there are %d more %s atoms on the right-hand side',
                              conserved[j, k], elements[j])

        rxn_inds_to_keep = sorted(
            set(range(self.S.shape[1])).difference(rxn_inds_to_remove))

        # drop the removed reactions from every per-reaction attribute
        self.S = self.S[:, rxn_inds_to_keep]
        self.dG0_prime = self.dG0_prime[rxn_inds_to_keep]
        self.T = self.T[rxn_inds_to_keep]
        self.I = self.I[rxn_inds_to_keep]
        self.pH = self.pH[rxn_inds_to_keep]
        self.pMg = self.pMg[rxn_inds_to_keep]
        self.weight = self.weight[rxn_inds_to_keep]
        self.reference = [self.reference[i] for i in rxn_inds_to_keep]
        self.description = [self.description[i] for i in rxn_inds_to_keep]

        logging.debug('After removing %d unbalanced reactions, the stoichiometric '
                      'matrix contains: '
                      '%d compounds and %d reactions' %
                      (len(rxn_inds_to_remove), self.S.shape[0], self.S.shape[1]))
 def from_formulas(reaction_strings, arrow='<=>', has_reaction_ids=False,
                   raise_exception=False):
     """
     parses a list of reactions in KEGG format

     A reaction that cannot be parsed, or that is not balanced, is replaced
     by an empty KeggReaction and a warning is logged.

     Arguments:
        reaction_strings - a list of reactions in KEGG format
        arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
        has_reaction_ids - a boolean flag indicating if there is a column of
                           reaction IDs (separated from the reaction with
                           whitespaces)
        raise_exception  - passed on to KeggReaction.is_balanced(); if True,
                           a ValueError raised while building the model is
                           re-raised instead of being logged

     Return values:
        the result of KeggModel.from_kegg_reactions(), or None if a
        ValueError occurred and raise_exception is False
     """
     try:
         reactions = []
         not_balanced_count = 0
         for line in reaction_strings:
             rid = None
             if has_reaction_ids:
                 match = re.search(r'(\w+)\s+(.*)', line.strip())
                 if match is None:
                     # report a missing ID column through the same
                     # ValueError path as every other model error
                     raise ValueError('Cannot find a reaction ID in: ' + line)
                 rid, line = match.groups()
             try:
                 reaction = KeggReaction.parse_formula(line, arrow, rid)
             except KeggParseException as e:
                 logging.warning(str(e))
                 reaction = KeggReaction({})
             if not reaction.is_balanced(fix_water=True, raise_exception=raise_exception):
                 not_balanced_count += 1
                 logging.warning('Model contains an unbalanced reaction: ' + line)
                 reaction = KeggReaction({})
             reactions.append(reaction)
             logging.debug('Adding reaction: ' + reaction.write_formula())

         if not_balanced_count > 0:
             warning_str = '%d out of the %d reactions are not chemically balanced' % \
                           (not_balanced_count, len(reaction_strings))
             logging.debug(warning_str)
         return KeggModel.from_kegg_reactions(reactions, has_reaction_ids)

     except ValueError as e:
         if raise_exception:
             # bare raise keeps the original traceback
             raise
         logging.debug(str(e))
         return None
    def read_tecrdb(fname, weight):
        """Read the raw data of TECRDB (NIST)

        Rows with an empty K', T or pH field are skipped.

        Arguments:
           fname  - path of a tab-delimited file whose columns follow the
                    order of the 'headers' list below
           weight - the value stored under 'weight' in every entry

        Return values:
           a list of dictionaries with the keys: reaction, dG'0, T, I, pH,
           pMg, weight, balance
        """
        thermo_params = [] # columns are: reaction, dG'0, T, I, pH, pMg, weight, balance?

        headers = ["URL", "REF_ID", "METHOD", "EVAL", "EC", "ENZYME NAME",
                   "REACTION IN KEGG IDS", "REACTION IN COMPOUND NAMES",
                   "K", "K'", "T", "I", "pH", "pMg"]

        # the 'with' block closes the file even when a row cannot be parsed
        with open(fname, 'r') as tsv_file:
            for row_list in csv.reader(tsv_file, delimiter='\t'):
                if row_list == []:
                    continue
                row = dict(zip(headers, row_list))
                if (row['K\''] == '') or (row['T'] == '') or (row['pH'] == ''):
                    continue

                # parse the reaction
                reaction = KeggReaction.parse_formula(row['REACTION IN KEGG IDS'], arrow='=')

                # calculate dG'0
                temperature = TrainingData.str2double(row['T'])
                dG0_prime = -R * temperature * \
                                 np.log(TrainingData.str2double(row['K\'']))
                try:
                    thermo_params.append({'reaction': reaction,
                                          'dG\'0' : dG0_prime,
                                          'T': temperature,
                                          'I': TrainingData.str2double(row['I']),
                                          'pH': TrainingData.str2double(row['pH']),
                                          'pMg': TrainingData.str2double(row['pMg']),
                                          'weight': weight,
                                          'balance': True})
                except ValueError:
                    raise Exception('Cannot parse row: ' + str(row))

        logging.info('Successfully added %d reactions from TECRDB' % len(thermo_params))
        return thermo_params
    def add_thermo(self, cc):
        """
        Store the values returned by cc.get_dG0_r_multi() for this model.

        Every column of self.S is converted into a KeggReaction that holds
        its non-zero coefficients keyed by self.cids. The list of reactions
        is passed to cc.get_dG0_r_multi() and the two returned values are
        stored in self.dG0 and self.cov_dG0.

        Arguments:
           cc - an object that provides get_dG0_r_multi(reactions)
        """
        Nc, Nr = self.S.shape
        reactions = []
        # range() rather than xrange() so that this also runs on Python 3
        for j in range(Nr):
            sparse = {
                self.cids[i]: self.S[i, j]
                for i in range(Nc) if self.S[i, j] != 0
            }
            reactions.append(KeggReaction(sparse))

        self.dG0, self.cov_dG0 = cc.get_dG0_r_multi(reactions)
    def from_formulas(reaction_strings,
                      arrow='<=>',
                      has_reaction_ids=False,
                      raise_exception=False):
        """
        parses a list of reactions in KEGG format

        A reaction that cannot be parsed, or that is not balanced, is
        replaced by an empty KeggReaction and a warning is logged.

        Arguments:
           reaction_strings - a list of reactions in KEGG format
           arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
           has_reaction_ids - a boolean flag indicating if there is a column of
                              reaction IDs (separated from the reaction with
                              whitespaces)
           raise_exception  - passed on to KeggReaction.is_balanced(); if
                              True, a ValueError raised while building the
                              model is re-raised instead of being logged

        Return values:
           the result of KeggModel.from_kegg_reactions(), or None if a
           ValueError occurred and raise_exception is False
        """
        try:
            reactions = []
            not_balanced_count = 0
            for line in reaction_strings:
                rid = None
                if has_reaction_ids:
                    match = re.search(r'(\w+)\s+(.*)', line.strip())
                    if match is None:
                        # report a missing ID column through the same
                        # ValueError path as every other model error
                        raise ValueError('Cannot find a reaction ID in: ' +
                                         line)
                    rid, line = match.groups()
                try:
                    reaction = KeggReaction.parse_formula(line, arrow, rid)
                except KeggParseException as e:
                    logging.warning(str(e))
                    reaction = KeggReaction({})
                if not reaction.is_balanced(fix_water=True,
                                            raise_exception=raise_exception):
                    not_balanced_count += 1
                    logging.warning('Model contains an unbalanced reaction: ' +
                                    line)
                    reaction = KeggReaction({})
                reactions.append(reaction)
                logging.debug('Adding reaction: ' + reaction.write_formula())

            if not_balanced_count > 0:
                warning_str = '%d out of the %d reactions are not chemically balanced' % \
                              (not_balanced_count, len(reaction_strings))
                logging.debug(warning_str)
            return KeggModel.from_kegg_reactions(reactions, has_reaction_ids)

        except ValueError as e:
            if raise_exception:
                # bare raise keeps the original traceback
                raise
            logging.debug(str(e))
            return None
Ejemplo n.º 8
0
    def read_tecrdb():
        """Read the raw data of TECRDB (NIST)

        The file name and the weight are taken from
        TrainingData.FNAME_DICT['TECRDB']. Rows with an empty K', T or pH
        field are skipped.

        Return values:
           a list of dictionaries with the keys: reaction, dG'0, T, I, pH,
           pMg, weight, balance
        """
        fname, weight = TrainingData.FNAME_DICT['TECRDB']

        thermo_params = [
        ]  # columns are: reaction, dG'0, T, I, pH, pMg, weight, balance?

        headers = [
            "URL", "REF_ID", "METHOD", "EVAL", "EC", "ENZYME NAME",
            "REACTION IN KEGG IDS", "REACTION IN COMPOUND NAMES", "K", "K'",
            "T", "I", "pH", "pMg"
        ]

        # the 'with' block closes the file even when a row cannot be parsed
        with open(fname, 'r') as tsv_file:
            for row_list in csv.reader(tsv_file, delimiter='\t'):
                if row_list == []:
                    continue
                row = dict(zip(headers, row_list))
                if (row['K\''] == '') or (row['T'] == '') or (row['pH'] == ''):
                    continue

                # parse the reaction
                reaction = KeggReaction.parse_formula(
                    row['REACTION IN KEGG IDS'], arrow='=')

                # calculate dG'0
                temperature = TrainingData.str2double(row['T'])
                dG0_prime = -R * temperature * \
                                 np.log(TrainingData.str2double(row['K\'']))
                try:
                    thermo_params.append({
                        'reaction': reaction,
                        'dG\'0': dG0_prime,
                        'T': temperature,
                        'I': TrainingData.str2double(row['I']),
                        'pH': TrainingData.str2double(row['pH']),
                        'pMg': TrainingData.str2double(row['pMg']),
                        'weight': weight,
                        'balance': True
                    })
                except ValueError:
                    raise Exception('Cannot parse row: ' + str(row))

        logging.info('Successfully added %d reactions from TECRDB' %
                     len(thermo_params))
        return thermo_params
Ejemplo n.º 9
0
    def read_redox():
        """Read the Reduction potential data

        The file name and the weight are taken from
        TrainingData.FNAME_DICT['REDOX']. Each row becomes a reaction from
        the compound in 'CID_ox' (coefficient -1) to the compound in
        'CID_red' (coefficient 1).

        Return values:
           a list of dictionaries with the keys: reaction, dG'0, T, I, pH,
           pMg, weight, balance
        """

        fname, weight = TrainingData.FNAME_DICT['REDOX']
        # columns are: reaction, dG'0, T, I, pH, pMg, weight, balance?
        thermo_params = []

        # fields are: name, CID_ox, nH_ox, charge_ox, CID_red,
        #             nH_red, charge_red, E'0, pH, I, pMg, T, ref
        # the 'with' block closes the file even when a row cannot be parsed
        with open(fname, 'r') as tsv_file:
            for row in csv.DictReader(tsv_file, delimiter='\t'):
                cid_ox = int(row['CID_ox'])
                cid_red = int(row['CID_red'])
                delta_nH = TrainingData.str2double(row['nH_red']) - \
                           TrainingData.str2double(row['nH_ox'])
                delta_charge = TrainingData.str2double(row['charge_red']) - \
                               TrainingData.str2double(row['charge_ox'])
                delta_e = delta_nH - delta_charge
                dG0_prime = -F * TrainingData.str2double(row['E\'0']) * delta_e

                thermo_params.append({
                    'reaction': KeggReaction({cid_ox: -1, cid_red: 1}),
                    'dG\'0': dG0_prime,
                    'T': TrainingData.str2double(row['T']),
                    'I': TrainingData.str2double(row['I']),
                    'pH': TrainingData.str2double(row['pH']),
                    'pMg': TrainingData.str2double(row['pMg']),
                    'weight': weight,
                    'balance': False
                })

        logging.info('Successfully added %d redox potentials' %
                     len(thermo_params))
        return thermo_params
Ejemplo n.º 10
0
 def from_formulas(reaction_strings, arrow='<=>', has_reaction_ids=False):
     """
     parses a list of reactions in KEGG format

     Arguments:
        reaction_strings - a list of reactions in KEGG format
        arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
        has_reaction_ids - a boolean flag indicating if there is a column of
                           reaction IDs (separated from the reaction with
                           whitespaces)

     Return values:
        a KeggModel built from the stoichiometric matrix S, the sorted list
        of compound IDs (the rows of S) and the list of reaction IDs (None
        if has_reaction_ids is False)

     Raises:
        ValueError - if a reaction is not balanced, or if has_reaction_ids
                     is True and a line has no reaction ID column
     """

     cids = set()
     rids = [] if has_reaction_ids else None
     reactions = []
     for line in reaction_strings:
         if has_reaction_ids:
             # re.split() with capturing groups returns
             # ['', rid, formula, ''], so read the two groups directly
             match = re.search(r'(\w+)\s+(.*)', line)
             if match is None:
                 raise ValueError('Cannot find a reaction ID in: ' + line)
             rid, line = match.groups()
             rids.append(rid)
         reaction = KeggReaction.parse_formula(line, arrow)
         if not reaction.is_balanced():
             raise ValueError('Model contains unbalanced reactions')
         cids.update(reaction.keys())
         reactions.append(reaction)

     # convert the list of reactions in sparse notation into a full
     # stoichiometric matrix, where the rows (compounds) are according to the
     # CID list 'cids'.
     cids = sorted(cids)
     row_of_cid = {cid: i for i, cid in enumerate(cids)}
     S = np.zeros((len(cids), len(reactions)))
     for j, reaction in enumerate(reactions):
         for cid, coeff in reaction.iteritems():
             S[row_of_cid[cid], j] = coeff

     return KeggModel(S, cids, rids)
Ejemplo n.º 11
0
    def from_formulas(reaction_strings, arrow='<=>', has_reaction_ids=False):
        """
        parses a list of reactions in KEGG format

        Arguments:
           reaction_strings - a list of reactions in KEGG format
           arrow            - the string used as the 'arrow' in each reaction (default: '<=>')
           has_reaction_ids - a boolean flag indicating if there is a column of
                              reaction IDs (separated from the reaction with
                              whitespaces); the IDs themselves are discarded

        Return values:
           a KeggModel built from the stoichiometric matrix S and the sorted
           list of compound IDs (the rows of S)

        Raises:
           ValueError - if a reaction is not balanced, or if has_reaction_ids
                        is True and a line has no reaction ID column
        """

        cids = set()
        reactions = []
        for line in reaction_strings:
            if has_reaction_ids:
                # re.split() with capturing groups returns
                # ['', rid, formula, ''], so read the formula group directly
                match = re.search(r'(\w+)\s+(.*)', line)
                if match is None:
                    raise ValueError('Cannot find a reaction ID in: ' + line)
                line = match.group(2)
            reaction = KeggReaction.parse_formula(line, arrow)
            if not reaction.is_balanced():
                raise ValueError('Model contains unbalanced reactions')
            cids.update(reaction.keys())
            reactions.append(reaction)

        # convert the list of reactions in sparse notation into a full
        # stoichiometric matrix, where the rows (compounds) are according to the
        # CID list 'cids'.
        cids = sorted(cids)
        row_of_cid = {cid: i for i, cid in enumerate(cids)}
        S = np.zeros((len(cids), len(reactions)))
        for j, reaction in enumerate(reactions):
            for cid, coeff in reaction.iteritems():
                S[row_of_cid[cid], j] = coeff

        return KeggModel(S, cids)
Ejemplo n.º 12
0
    def read_formations():
        """Read the Formation Energy data

        The file name and the weight are taken from
        TrainingData.FNAME_DICT['FORMATION']. Rows with an empty dG'0 field
        add no entry to the list, but are still considered for the set of
        compounds that do not decompose.

        Return values:
           thermo_params            - a list of dictionaries with the keys:
                                      reaction, dG'0, T, I, pH, pMg, weight,
                                      balance
           cids_that_dont_decompose - the set of compound IDs whose
                                      'decompose' field is 0
        """
        fname, weight = TrainingData.FNAME_DICT['FORMATION']
        # columns are: reaction, dG'0, T, I, pH, pMg, weight, balance?
        thermo_params = []
        cids_that_dont_decompose = set()

        # fields are: cid, name, dG'0, pH, I, pMg, T, decompose?,
        #             compound_ref, remark
        # the 'with' block closes the file even when a row cannot be parsed
        with open(fname, 'r') as tsv_file:
            for row in csv.DictReader(tsv_file, delimiter='\t'):
                cid = int(row['cid'])
                if int(row['decompose']) == 0:
                    cids_that_dont_decompose.add(cid)
                if row['dG\'0'] != '':
                    thermo_params.append({
                        'reaction': KeggReaction({cid: 1}),
                        'dG\'0': TrainingData.str2double(row['dG\'0']),
                        'T': TrainingData.str2double(row['T']),
                        'I': TrainingData.str2double(row['I']),
                        'pH': TrainingData.str2double(row['pH']),
                        'pMg': TrainingData.str2double(row['pMg']),
                        'weight': weight,
                        'balance': False
                    })

        logging.info('Successfully added %d formation energies' %
                     len(thermo_params))
        return thermo_params, cids_that_dont_decompose
Ejemplo n.º 13
0
    def balance_reactions(self, rxn_inds_to_balance):
        """
            Balance the selected reactions with water and remove the ones
            that remain unbalanced.

            The element matrix of self.cids is taken from self.ccache.
            Reactions that involve a compound whose element row contains NaN
            are excluded from the check (and are never removed). If 'O' is one
            of the elements, the coefficient of compound 1 is adjusted in
            every checked reaction so that oxygen is conserved. Checked
            reactions that still show a non-zero imbalance in any element are
            logged and removed from self.S and from the per-reaction
            attributes, which are indexed along their second axis.

            Arguments:
               rxn_inds_to_balance - iterable of column indices of self.S
                                     that should be checked
        """
        elements, Ematrix = self.ccache.get_kegg_ematrix(self.cids)
        cpd_inds_without_formula = list(
            np.nonzero(np.any(np.isnan(Ematrix), 1))[0].flat)
        Ematrix[np.isnan(Ematrix)] = 0

        S_without_formula = self.S[cpd_inds_without_formula, :]
        # flatten before calling nonzero so that the result holds reaction
        # (column) indices for ndarray and np.matrix inputs alike
        has_unknown_cpd = np.asarray(np.any(S_without_formula != 0,
                                            0)).ravel()
        rxn_inds_without_formula = np.nonzero(has_unknown_cpd)[0]
        rxn_inds_to_balance = set(rxn_inds_to_balance).difference(
            rxn_inds_without_formula)

        # adjust the water coefficient of every checked reaction so that
        # oxygen is conserved; the other elements are verified afterwards
        if 'O' in elements:
            i_H2O = self.cids.index(1)
            j_O = elements.index('O')
            conserved = np.dot(Ematrix.T, self.S)
            for k in rxn_inds_to_balance:
                self.S[i_H2O, k] = self.S[i_H2O, k] - conserved[j_O, k]

        # recalculate conservation matrix; np.dot is a matrix product for
        # ndarray and np.matrix alike, whereas '*' is elementwise on ndarrays
        conserved = np.dot(Ematrix.T, self.S)

        # sorted() only makes the order of the log messages deterministic
        rxn_inds_to_remove = [
            k for k in sorted(rxn_inds_to_balance)
            if np.any(conserved[:, k] != 0, 0)
        ]

        for k in rxn_inds_to_remove:
            sprs = {}
            for i in np.nonzero(self.S[:, k])[0]:
                sprs[self.cids[i]] = self.S[i, k]
            reaction = KeggReaction(sprs)
            logging.debug('unbalanced reaction #%d: %s',
                          k, reaction.write_formula())
            for j in np.where(conserved[:, k])[0].flat:
                logging.debug(
                    'there are %d more %s atoms on the right-hand side',
                    conserved[j, k], elements[j])

        rxn_inds_to_keep = sorted(
            set(range(self.S.shape[1])).difference(rxn_inds_to_remove))

        # drop the removed reactions from every per-reaction attribute
        self.S = self.S[:, rxn_inds_to_keep]
        self.dG0_prime = self.dG0_prime[:, rxn_inds_to_keep]
        self.T = self.T[:, rxn_inds_to_keep]
        self.I = self.I[:, rxn_inds_to_keep]
        self.pH = self.pH[:, rxn_inds_to_keep]
        self.pMg = self.pMg[:, rxn_inds_to_keep]
        self.weight = self.weight[:, rxn_inds_to_keep]

        logging.info(
            'After removing %d unbalanced reactions, the stoichiometric '
            'matrix contains: '
            '%d compounds and %d reactions' %
            (len(rxn_inds_to_remove), self.S.shape[0], self.S.shape[1]))