def MatchCRCDataToKEGG():
    """
        Reads the raw data collected from the CRC handbook and tries to map every
        compound name there to a KEGG compound ID.
        Then it writes the results to the 'Public' Database
    """

    public_db = SqliteDatabase('../data/public_data.sqlite')
    kegg = Kegg.getInstance()

    cas2cid = {}
    for cid, comp in kegg.cid2compound_map.iteritems():
        if comp.cas:
            cas2cid[comp.cas] = cid

    public_db.CreateTable('pKa_from_CRC', [
        'cas TEXT', 'cid INT', 'name TEXT', 'formula TEXT', 'T REAL',
        'pKa REAL'
    ])
    for row_dict in csv.DictReader(
            open('../data/thermodynamics/pKa_from_CRC.csv', 'r')):
        cas = row_dict['CAS']
        name = row_dict['name']
        formula = row_dict['formula']
        if row_dict['T']:
            T = 273.15 + float(row_dict['T'])
        else:
            T = None

        if row_dict['pKa'][0] == '~':
            pKa = float(row_dict['pKa'][1:])
        else:
            pKa = float(row_dict['pKa'])

        if cas not in cas2cid:
            logging.warning('Cannot find this CAS number (%s, %s) in KEGG' %
                            (cas, name))
            cid = None
        else:
            cid = cas2cid[cas]
            name = kegg.cid2name(cid)
        public_db.Insert(
            'pKa_from_CRC',
            [unicode(cas), cid,
             unicode(name),
             unicode(formula), T, pKa])
    public_db.Commit()
Beispiel #2
0
class KeggGenes(object):
    
    def __init__(self, html_fname):
        self.serv = None
        self.db = SqliteDatabase('channeling/channeling.sqlite', 'w')
        self.html_writer = HtmlWriter(html_fname)
        
        self.COMPOUND_TABLE_NAME = 'kegg_compounds'
        self.GENE_TABLE_NAME = 'kegg_genes'
        self.GENE_REACTION_TABLE_NAME = 'kegg_genes_to_reactions'
        self.REACTION_TABLE_NAME = 'kegg_reactions'
        self.EQUATION_TABLE_NAME = 'kegg_equations'
        self.STOICHIOMETRY_TABLE_NAME = 'kegg_stoichiometry'
        self.GIBBS_ENERGY_TABLE_NAME = 'kegg_gibbs_energies'
        self.GENE_ENERGY_TABLE_NAME = 'kegg_gene_energies'
        self.FUNCTIONAL_INTERATCTIONS_TABLE = 'parkinson_functional_interactions'
        self.GENE_PAIRS_TABLE_NAME = 'kegg_gene_pairs'
        self.COFACTOR_TABLE_NAME = 'kegg_cofactors'

    def GetAllCompounds(self):
        self.db.CreateTable(self.COMPOUND_TABLE_NAME, "compound INT, name TEXT, all_names TEXT", drop_if_exists=True)
        self.db.CreateIndex('compound_idx', self.COMPOUND_TABLE_NAME, 'compound', unique=True, drop_if_exists=True)

        f = urllib.urlopen('http://rest.kegg.jp/list/cpd/')
        for row in f.read().split('\n'):
            if row.strip() == '':
                continue
            if row.find('\t') != -1:
                compound, all_names = row.split('\t', 1)
            else:
                raise ValueError('Bad compound name: ' + row)
            name = all_names.split(';')[0]
            self.db.Insert(self.COMPOUND_TABLE_NAME, [compound, name, all_names])
        self.db.Commit()
    
    def GetAllGenes(self, organism='eco'):
        self.db.CreateTable(self.GENE_TABLE_NAME, ['organism', 'gene', 'desc'], drop_if_exists=False)
        self.db.CreateIndex('gene_idx', self.GENE_TABLE_NAME, 'gene', unique=False, drop_if_exists=False)

        self.db.Execute("DELETE FROM %s WHERE organism = '%s'" % 
                        (self.GENE_TABLE_NAME, organism))

        f = urllib.urlopen('http://rest.kegg.jp/list/%s/' % organism)
        for row in f.read().split('\n'):
            if row.strip() == '':
                continue
            gene, desc = row.split('\t')
            self.db.Insert(self.GENE_TABLE_NAME, [organism, gene, desc])
        self.db.Commit()
    
    def GetAllReactions(self, organism='eco'):
        self.db.CreateTable(self.GENE_REACTION_TABLE_NAME, ['organism', 'gene', 'reaction'], drop_if_exists=False)
        self.db.CreateIndex('reaction_gene_idx', self.GENE_REACTION_TABLE_NAME, 'gene', unique=False, drop_if_exists=False)
        self.db.CreateIndex('reaction_idx', self.GENE_REACTION_TABLE_NAME, 'reaction', unique=False, drop_if_exists=False)

        self.db.Execute("DELETE FROM %s WHERE organism = '%s'" % 
                        (self.GENE_REACTION_TABLE_NAME, organism))

        f = urllib.urlopen('http://rest.kegg.jp/link/rn/%s' % organism)
        for row in f.read().split('\n'):
            if row.strip() == '':
                continue
            gene, reaction = row.split('\t')
            self.db.Insert(self.GENE_REACTION_TABLE_NAME, [organism, gene, reaction])
        self.db.Commit()        
                
    def GetAllEquations(self):
        self.db.CreateTable(self.EQUATION_TABLE_NAME, ['reaction', 'equation'], drop_if_exists=True)
        self.db.CreateIndex('equation_reaction_idx', self.EQUATION_TABLE_NAME, 'reaction', unique=False, drop_if_exists=True)
        self.db.CreateIndex('equation_idx', self.EQUATION_TABLE_NAME, 'equation', unique=False, drop_if_exists=True)

        all_reactions = []
        for row in self.db.Execute("SELECT distinct(reaction) FROM %s" % 
                                   (self.GENE_REACTION_TABLE_NAME)):
            all_reactions.append(str(row[0]))
        
        for reaction in all_reactions:
            f = urllib.urlopen('http://rest.kegg.jp/get/%s' % reaction)
            for equation in self._ReadReactionEntries(f.read()):
                self.db.Insert(self.EQUATION_TABLE_NAME,
                    [reaction, equation])
                sys.stderr.write('Equation for reaction %s: %s\n' % (reaction, equation))
        self.db.Commit()

    def _ReadReactionEntries(self, s):
        equation_list = []
        entry2fields_map = kegg_parser.ParsedKeggFile.FromKeggAPI(s)
        for key in sorted(entry2fields_map.keys()):
            field_map = entry2fields_map[key]
            if "EQUATION" in field_map:
                equation_list.append(field_map["EQUATION"])
        
        return equation_list
    
    def GetStoichiometries(self):
        self.db.CreateTable(self.STOICHIOMETRY_TABLE_NAME, "equation TEXT, compound TEXT, coefficient REAL", drop_if_exists=True)
        self.db.CreateIndex('stoichiometry_equation_idx', self.STOICHIOMETRY_TABLE_NAME, 'equation', unique=False, drop_if_exists=True)
        self.db.CreateIndex('stoichiometry_compound_idx', self.STOICHIOMETRY_TABLE_NAME, 'compound', unique=False, drop_if_exists=True)

        all_kegg_reactions = []
        all_equations = []
        for row in self.db.Execute("SELECT distinct(equation) FROM %s" % 
                                   (self.EQUATION_TABLE_NAME)):
            try:
                r = Reaction.FromFormula(str(row[0]))
                all_equations.append(str(row[0]))
                all_kegg_reactions.append(r)
            except (KeggParseException, KeggNonCompoundException):
                pass
        
        for i, equation in enumerate(all_equations):
            for compound, coefficient in all_kegg_reactions[i].iteritems():
                self.db.Insert(self.STOICHIOMETRY_TABLE_NAME,
                               [equation, "cpd:C%05d" % compound, coefficient])
    
        self.db.Commit()

    def GetForamtionEnergies(self, thermo):
        self.db.CreateTable(self.GIBBS_ENERGY_TABLE_NAME, "equation TEXT, dG0 REAL, dGc REAL", drop_if_exists=True)
        self.db.CreateIndex('gibbs_equation_idx', self.GIBBS_ENERGY_TABLE_NAME, 'equation', unique=True, drop_if_exists=True)

        all_equations = set()
        for row in self.db.Execute("SELECT distinct(equation) FROM %s" % 
                                   (self.EQUATION_TABLE_NAME)):
            all_equations.add(str(row[0]))
        
        from pygibbs.kegg import Kegg
        kegg = Kegg.getInstance()
        all_kegg_cids = set(kegg.get_all_cids())
        for equation in all_equations:
            try:
                rxn = Reaction.FromFormula(equation)
                if not rxn.get_cids().issubset(all_kegg_cids):
                    raise KeggNonCompoundException
                rxn.Balance(balance_water=True, exception_if_unknown=True)
                dG0 = thermo.GetTransfromedKeggReactionEnergies([rxn], conc=1)[0, 0]
                dGc = thermo.GetTransfromedKeggReactionEnergies([rxn], conc=1e-3)[0, 0]
                self.db.Insert(self.GIBBS_ENERGY_TABLE_NAME, [equation, dG0, dGc])
                
            except (KeggParseException, KeggNonCompoundException, KeggReactionNotBalancedException):
                self.db.Insert(self.GIBBS_ENERGY_TABLE_NAME, [equation, None, None])
    
        self.db.Commit()
    
    def LoadCofactors(self):
        self.db.CreateTable(self.COFACTOR_TABLE_NAME,
                            'compound TEXT, name TEXT, c_min REAL, c_max REAL, ref TEXT',
                            drop_if_exists=True)
        self.db.CreateIndex('cofactor_idx', self.COFACTOR_TABLE_NAME,
                            'compound', unique=True, drop_if_exists=True)

        csv_reader = csv.DictReader(open('channeling/cofactors.csv', 'r'))
        for rowdict in csv_reader:
            self.db.Insert(self.COFACTOR_TABLE_NAME,
                           ["cpd:C%05d" % int(rowdict['cid']), rowdict['name'],
                            float(rowdict['c_min'] or np.nan), float(rowdict['c_max'] or np.nan),
                            rowdict['ref']])
        self.db.Commit()
        
    def CreateGeneEnergyTable(self):
        self.db.CreateTable(self.GENE_ENERGY_TABLE_NAME,
                            "gene TEXT, reaction TEXT, dGc REAL, compound INT, coefficient REAL",
                            drop_if_exists=True)
        self.db.CreateIndex('gene_energy_compound_idx',
                            self.GENE_ENERGY_TABLE_NAME, 'compound', unique=False)
        self.db.CreateIndex('gene_energy_gene_idx',
                            self.GENE_ENERGY_TABLE_NAME, 'gene', unique=False)

        query = """
            INSERT INTO %s (gene, reaction, dGc, compound, coefficient)
                SELECT  gen.gene, rxn.reaction, eng.dGc, sto.compound, sto.coefficient
                FROM    kegg_genes gen, kegg_genes_to_reactions rxn,
                        kegg_equations eqn, kegg_gibbs_energies eng,
                        kegg_stoichiometry sto
                WHERE   gen.organism = 'eco'
                AND     gen.gene = rxn.gene
                AND     rxn.reaction = eqn.reaction
                AND     eqn.equation = eng.equation
                AND     eng.dG0 IS NOT NULL
                AND     eqn.equation = sto.equation
        """ % self.GENE_ENERGY_TABLE_NAME
        self.db.Execute(query)

        query = """
            INSERT INTO %s (gene, reaction, dGc, compound, coefficient)
                SELECT  gen.gene, rxn.reaction, -eng.dGc, sto.compound, -sto.coefficient
                FROM    kegg_genes gen, kegg_genes_to_reactions rxn,
                        kegg_equations eqn, kegg_gibbs_energies eng,
                        kegg_stoichiometry sto
                WHERE   gen.organism = 'eco'
                AND     gen.gene = rxn.gene
                AND     rxn.reaction = eqn.reaction
                AND     eqn.equation = eng.equation
                AND     eng.dG0 IS NOT NULL
                AND     eqn.equation = sto.equation
        """ % self.GENE_ENERGY_TABLE_NAME
        self.db.Execute(query)
        self.db.Commit()
        
    def CreateGenePairsTable(self):
        self.db.CreateTable(self.GENE_PAIRS_TABLE_NAME,
                            "gene1 TEXT, gene2 TEXT, reaction1 TEXT, reaction2 TEXT, "
                            "compound TEXT, coeff1 REAL, coeff2 REAL, dGc1 REAL, "
                            "dGc2 REAL, score REAL",
                            drop_if_exists=True)
        self.db.CreateIndex('gene_pairs_gene_idx',
                            self.GENE_PAIRS_TABLE_NAME,
                            'gene1, gene2', unique=False)
        query = """
            INSERT INTO %s (gene1, gene2, reaction1, reaction2, compound, coeff1, coeff2, dGc1, dGc2, score)
                SELECT p.*, pfi.score FROM
                (
                    SELECT  kge1.gene gene1,
                            kge2.gene gene2, 
                            kge1.reaction reaction1,
                            kge2.reaction reaction2,
                            kge1.compound compound, 
                            kge1.coefficient coeff1,
                            kge2.coefficient coeff2,
                            cast(kge1.dGc as real) dGc1, 
                            cast(kge2.dGc as real) dGc2
                    FROM    kegg_gene_energies kge1, kegg_gene_energies kge2
                    WHERE   kge1.compound = kge2.compound
                    AND     kge1.compound NOT IN (SELECT compound FROM %s)
                    AND     kge1.gene != kge2.gene
                    AND     kge1.reaction != kge2.reaction
                    AND     kge1.coefficient > 0
                    AND     kge2.coefficient < 0
                ) p
                LEFT OUTER JOIN %s pfi
                ON      (pfi.gene1 = p.gene1 AND pfi.gene2 = p.gene2
                         OR
                         pfi.gene1 = p.gene2 AND pfi.gene2 = p.gene1)
        """ % (self.GENE_PAIRS_TABLE_NAME, self.COFACTOR_TABLE_NAME, self.FUNCTIONAL_INTERATCTIONS_TABLE)
        self.db.Execute(query)
        self.db.Commit()
        
    def Correlate(self, dGc1_lower, dGc2_upper, reverse=False):
        if reverse:
            cond = "kgp.dGc1 < %d AND kgp.dGc2 > %d" % (dGc1_lower, dGc2_upper)
        else:
            cond = "kgp.dGc1 > %d AND kgp.dGc2 < %d" % (dGc1_lower, dGc2_upper)
        
        query = """
            SELECT  kgp.gene1, kgp.gene2, sum(%s) nqual, count(*) ntot, max(score)
            FROM %s kgp
            GROUP BY kgp.gene1, kgp.gene2
        """ % (cond, self.GENE_PAIRS_TABLE_NAME)
        
        counters = np.zeros((2, 2))
        
        for row in self.db.Execute(query):
            _gene1, _gene2, nqual, _ntot, score = row
            i = int(score is not None) # is there an PP-interaction
            j = int(nqual > 0) # is this a qualifying pair (thermodynamically)
            counters[i, j] += 1.0

        _inter0 = np.sum(counters[0, :])
        inter1 = np.sum(counters[1, :])
        qual0 = np.sum(counters[:, 0])
        qual1 = np.sum(counters[:, 1])
        total = np.sum(counters.flat)
        
        print "-" * 50
        if reverse:
            print "Checking criterion: first < %d and second > %d" % (dGc1_lower, dGc2_upper)
        else:
            print "Checking criterion: first > %d and second < %d" % (dGc1_lower, dGc2_upper)
        print "Total no. of pairs = %d" % total
        
        print "interaction rate among all pairs (%d out of %d) = %.2f%%" % (inter1, total, 100*(inter1 / total))
        print "qualification rate among all pairs (%d out of %d) = %.2f%%" % (qual1, total, 100*(qual1 / total))
        print "interactions between unqualifying pairs (%d out of %d) = %.2f%%" % (counters[1,0], qual0, 100*(counters[1,0] / qual0))
        print "interactions between qualifying pairs (%d out of %d) = %.2f%%" % (counters[1,1], qual1, 100*(counters[1,1] / qual1))
        
        return counters[1,0] / qual0, counters[1,1] / qual1


    def LoadFunctionalInteractions(self,
            fname='../data/proteomics/coli/functional_interactions.txt'):

        self.db.CreateTable(self.FUNCTIONAL_INTERATCTIONS_TABLE,
                            ['gene1', 'gene2', 'score'],
                            drop_if_exists=True)
        self.db.CreateIndex('interaction_gene_idx',
                            self.FUNCTIONAL_INTERATCTIONS_TABLE,
                            'gene1, gene2', unique=False)
        
        tsv = csv.reader(open(fname, 'r'), delimiter='\t')
        for row in tsv:
            if row[0][0] == '#':
                continue
            gene1 = 'eco:' + row[0].lower()
            gene2 = 'eco:' + row[1].lower()
            score = float(row[2])
            self.db.Insert(self.FUNCTIONAL_INTERATCTIONS_TABLE,
                           [gene1, gene2, score])
        
        self.db.Commit()

    def PlotScatter(self):
        query = """
                SELECT  p.g1, p.g2, pfi.score
                FROM (
                      SELECT  kgp.gene1 gene1, kgp.gene2 gene2, cast(kgp.dGc1 as real) g1, cast(kgp.dGc2 as real) g2
                      FROM    %s kgp
                     ) p
                LEFT OUTER JOIN %s pfi
                ON      (pfi.gene1 = p.gene1 AND pfi.gene2 = p.gene2
                         OR
                         pfi.gene1 = p.gene2 AND pfi.gene2 = p.gene1)
            """ % (self.GENE_PAIRS_TABLE_NAME, self.FUNCTIONAL_INTERATCTIONS_TABLE)

        data = []
        for row in self.db.Execute(query):
            g1, g2, score = row
            data.append([float(g1), float(g2), float(score or 0)])
        data = np.matrix(data)

        ind1 = list(np.where(data[:, 2] > 0)[0].flat)
        ind2 = list(np.where(data[:, 2] == 0)[0].flat)
        fig = plt.figure(figsize=(6,6), dpi=90)    
        plt.plot(data[ind2, 0], data[ind2, 1], 'r.', markersize=5, figure=fig)
        plt.plot(data[ind1, 0], data[ind1, 1], 'g.', markersize=5, figure=fig)
        plt.show()
        
    def PlotCDF(self):
        special_pairs = {('eco:b3236', 'eco:b0720'):"mdh:gltA", # malate dehydrogenase -> oxaloacetate -> citrate synthase
                         ('eco:b1263', 'eco:b1264'):"trpD:trpE"} # trpD -> chorismate -> trpE (two components of anthraline synthase)
        
        query = """
                SELECT gene1, gene2, min(dGc2 - dGc1), max(score)
                FROM %s
                WHERE dGc1 + dGc2 < 0
                AND dGc1 > 10
                GROUP BY gene1, gene2
                """ % (self.GENE_PAIRS_TABLE_NAME)

        data = []
        markers = []
        for row in self.db.Execute(query):
            gene1, gene2, ddG, score = row
            if (gene1, gene2) in special_pairs:
                markers.append((special_pairs[(gene1, gene2)], ddG))
            data.append([ddG, float(score or 0)])
        data = np.matrix(data)

        ind1 = list(np.where(data[:, 1] > 0)[0].flat)
        ind2 = list(np.where(data[:, 1] == 0)[0].flat)
    
        fig = plt.figure(figsize=(6,6), dpi=90)    
        cdf((data[ind2, 0]).flat, label="non-interacting (N = %d)" % len(ind2), style='r', figure=fig)
        cdf((data[ind1, 0]).flat, label="interacting (N = %d)" % len(ind1), style='g', figure=fig)
        for label, ddG in markers:
            plt.plot([ddG, ddG], [0, 1], 'b--', figure=fig)
            plt.text(ddG, 0.1, label)
        plt.xlim(-500, 500)
        plt.xlabel(r"$\Delta G'^c$ (2nd) - $\Delta G'^c$ (1st) [kJ/mol]")
        plt.ylabel(r"Cumulative Distribution Function")
        plt.legend(loc="upper left")

        self.html_writer.embed_matplotlib_figure(fig, width=400, height=400, name='channeling_cdf')

    def PrintEnergies(self):
        query = """
                SELECT e.reaction, e.equation, g.dG0, g.dGc 
                FROM   kegg_equations e, kegg_gibbs_energies g 
                WHERE  e.equation = g.equation
                """

        self.html_writer.write('<font size="1">\n')
        column_names = ['KEGG Reaction', 'Formula', 'dG0', 'dGc']
        self.db.Query2HTML(self.html_writer, query, column_names)
        self.db.Query2CSV('../res/channeling_energy_tabel.csv', query, column_names)
        self.html_writer.write('</font>\n')

    def PrintPairs(self):
        query = """
                SELECT g.gene1, g.gene2, c.name, g.reaction1, g.reaction2, 
                       cast(g.dG1 as int), cast(g.dG2 as int), cast(g.ddG as int),
                       kg1.desc, kg2.desc, g.score FROM
                (SELECT gene1, gene2, reaction1, reaction2, compound, max(dGc1) dG1, min(dGc2) dG2, min(dGc2 - dGc1) ddG, max(score) score
                FROM kegg_gene_pairs
                WHERE dGc1 + dGc2 < 1000000
                AND   dGc1 > -1000000
                GROUP BY gene1, gene2, compound
                ORDER BY ddG) g, kegg_genes kg1, kegg_genes kg2, kegg_compounds c
                WHERE g.gene1 = kg1.gene AND g.gene2 = kg2.gene AND c.compound = g.compound
                """
        
        self.html_writer.write('<font size="1">\n')
        column_names = ['Gene 1', 'Gene 2', 'Common Compound',
                        'Reaction 1', 'Reaction 2',
                        'dGc1', 'dGc2', 'dG2-dG1', 'Desc 1', 'Desc 2',
                        'Score']
        self.db.Query2HTML(self.html_writer, query, column_names)
        self.db.Query2CSV('../res/channeling_pairs_table.csv', query, column_names)
        self.html_writer.write('</font>\n')

    def PrintAllPairs(self):
        query = """
                SELECT g.gene1, g.gene2, c.name, g.reaction1, g.reaction2, 
                       g.dG1, g.dG2 FROM
                (SELECT gene1, gene2, reaction1, reaction2, compound, max(dGc1) dG1, min(dGc2) dG2
                FROM kegg_gene_pairs
                GROUP BY gene1, gene2, compound
                ORDER BY gene1, gene2, reaction1, reaction2, compound) g, kegg_genes kg1, kegg_genes kg2, kegg_compounds c
                WHERE g.gene1 = kg1.gene AND g.gene2 = kg2.gene AND c.compound = g.compound
                """
        
        self.html_writer.write('<font size="1">\n')
        column_names = ['Gene 1', 'Gene 2', 'Common Compound',
                        'Reaction 1', 'Reaction 2',
                        'dGc1', 'dGc2']
        self.db.Query2HTML(self.html_writer, query, column_names)
        self.db.Query2CSV('../res/channeling_all_pairs_table.csv', query, column_names)
        self.html_writer.write('</font>\n')