Exemple #1
0
def query(release, user, pword, host, port):

    import getIntactDict
    import queryPDB
    import coordMap
    import parseUniChem
    ## Create the pdbDict which stores all the relevant information. The moldict
    ## is a look-up molDict[molregno] = cmpdId. Coordmap is a dictionary with the
    ## coordinate mappings between pdbe and uniprot.

    uniDict = parseUniChem.parse('data/unichemMappings.txt')
    coordMap = coordMap.coordMap('data/pdb_chain_uniprot.csv')
    intactDict = getIntactDict.getIntacts(uniDict, release, user, pword, host,
                                          port)
    pdbDict = queryPDB.queryPDB(uniDict, intactDict, coordMap, release)

    return pdbDict
Exemple #2
0
def query(release, user, pword, host, port): 
                           

  import getIntactDict
  import queryPDB
  import coordMap
  import parseUniChem
  ## Create the pdbDict which stores all the relevant information. The moldict
  ## is a look-up molDict[molregno] = cmpdId. Coordmap is a dictionary with the
  ## coordinate mappings between pdbe and uniprot.

  uniDict = parseUniChem.parse('data/unichemMappings.txt')
  coordMap = coordMap.coordMap()
  intactDict =  getIntactDict.getIntacts(uniDict, release, user, pword, host, port)
  pdbDict = queryPDB.queryPDB(uniDict, intactDict, coordMap,  release)

  return pdbDict                   
Exemple #3
0
def analysis(th, release, user, pword, host, port):

  ####
  #### Load data.
  ####
  
  ## Set threshold for all calculations.
  import numpy as np
  threshold = -np.log10(th*10**(-6))


  ## Get all ChEMBL targets with a Uniprot accession.
  import getUniprotTargets
  chemblTargets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)
  
  ## Read all human protein coding genes
  import parse
  humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
  #humanTargets = humanProtCodUniq.keys()
  print "We are dealing with %s human proteins" %len(humProtCod.keys())

  ## Get a list of all human (!) ChEMBL targets
  humChembl = {}
  for target in chemblTargets:
    if target in humProtCod.keys():
      humChembl[target] = 0

  ## Load the pfamDict.
  import pickle
  inFile = open('data/protCodPfamDict_%s.pkl' %release, 'r')
  pfamDict = pickle.load(inFile)
  inFile.close() 

  ## Load the pdbDict.
  import pickle
  infile = open('data/pdbDict_chembl%s.pkl' %release, 'r')
  pdbDict = pickle.load(infile)
  infile.close()

  ## Load the uniprotDict.
  import pickle
  infile  = open('data/bsDictUniprot_chembl%s.pkl'%release, 'r')
  uniprotDict = pickle.load(infile)
  infile.close()
  print 'number of targets with binding site information', len(uniprotDict.keys())


  ## Load the uniDict.
  import parseUniChem
  uniDict = parseUniChem.parse('data/unichemMappings.txt')

  ## Load the propDict.
  import pickle
  infile = open('data/propDict_%s.pkl'% release, 'r')
  propDict = pickle.load(infile)
  infile.close()

  ####
  #### Generate Plots.
  ####

  ## For each target in PfamDict, calculate the ratio of domain over non-domain regions.
  import getRatioUnstruct
  import writeTable
  import os
  pfamDict = getRatioUnstruct.getRatio(pfamDict, humProtCod, release, user, pword, host, port)
  writeTable.writePfam(pfamDict, humProtCod,humChembl, chemblTargets, release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s plotPfamStat.R' %release) 


  ## Assess small molecule binding within Pfam domains for PDBe entries.
  import matchData
  import evaluatePred 
  pdbDict = matchData.pdbe(pdbDict,pfamDict, release)
  evaluatePred.pdbe(pdbDict, 'within', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R' % ('within', "PDB" , release))

  
  ## Assess small molecule binding within Pfam domains for Uniprot entries.  
  import matchData
  import evaluatePred  
  uniprotDict = matchData.uniprot(uniprotDict,pfamDict,  release)
  evaluatePred.uniprot(uniprotDict, 'within', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R' % ('within', "Uni" , release))

  ## Print a summary of the number of targets and domains covered by the mapping. 
  import groupSize
  import os
  allDomains = groupSize.uniqueDomains(pfamDict)
  singleDomains = groupSize.singles(chemblTargets, pfamDict)
  groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singles)
  print "all possible groups (single, none, multi, conflict):",groupsAll
  (single, multi, conflict) = groupSize.groupSizeMap(chemblTargets, release, user , pword, host, port)
  print "all covered targets (single, multi, conflict): ", len(single), len(multi), len(conflict)
  (single, multi, conflict) = groupSize.actSizeMap(chemblTargets, release, user , pword, host, port)
  print "all covered targets (single, multi, conflict): ", len(single), len(multi),len(conflict)


  ## Plot the evaluation of the mappings.
  import queryDevice
  import matchData
  import evaluatePred
  import os

  intacts = queryDevice.queryDevice("SELECT mpf.protein_accession,mpf.domain,mpf.molregno, pfd.start, pfd.end, mpf.maptype, md.chembl_id FROM map_pfam mpf JOIN pfam_domains pfd ON pfd.protein_accession = mpf.protein_accession JOIN molecule_dictionary md ON md.molregno = mpf.molregno WHERE mpf.domain = pfd.domain", release, user, pword, host, port)

  # ...against PDBe  
  pdbDict = matchData.pdbePredicted(pdbDict,  intacts, uniDict)
  evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R' % ('prediction', 'PDB' , release))
  # ...against uniprot
  uniprotDict = matchData.uniprotPredicted(uniprotDict,  intacts)
  evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R' % ('prediction', "Uni" , release))


  ## Map the overlap
  #import overlap
  #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
  #overlap.overlap(propDict, tholds, release)  


  ## Power Law Distribution of domain occurences
  ##  Prepare the data for the power law plot.
  ##  1. Count the targets and compounds per domain using the propDict
  ##  2. Count a human genes per domain using the Pfam dictionary
  ##  3. Plot the power law distributions for all domains and overlay 25 most 
  ##     frequent domains
  import countFreqs
  import plplot
  import plplotRaw
  import parse 
  countFreqs.countLigs(humProtCod.keys(), chemblTargets, release ,user, pword, host, port)
  countFreqs.countDoms(humProtCod.keys(), pfamDict)
  filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']

  for filename in filenames:
    os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s statPowerLaw.R' %filename)
    al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
    freqs = parse.col2intlist('data/%s'%filename, 1, True)
    print len(freqs), minx, al, filename, type(freqs), type(freqs[1])
    plplot.plplot(freqs, minx, al, filename)
    plplotRaw.plplotRaw(freqs, filename) 


  ## Plot the ligand properties.
  import export
  import os
  selected = ['Pkinase','Pkinase_Tyr','p450','SNF','Trypsin', 'RVP']
  export.exportProps(selected,propDict, threshold, release, user, pword, host, port) 

  filename = 'data/cmpdProps_pKi%s_chembl%s.tab'%(int(threshold), release)
  os.system("/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s pca.R"%filename)
Exemple #4
0
def analysis(release):

    ####
    #### Load parameters.
    ####

    import yaml
    # Read config file.
    paramFile = open('mpf.yaml')
    params = yaml.safe_load(paramFile)
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    th = params['threshold']

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    import numpy as np
    threshold = -np.log10(th * 10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    import getUniprotTargets
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get a list of all human (!) ChEMBL targets
    humChembl = {}
    for target in chemblTargets.keys():
        if chemblTargets[target] == 'H**o sapiens':
            humChembl[target] = 0

    ## Read all human protein coding genes
    import parse
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    #humanTargets = humanProtCodUniq.keys()
    print "We are dealing with %s human proteins" % len(humProtCod.keys())

    ## Load the pfamDict.
    import pickle
    inFile = open('data/protCodPfamDict_%s.pkl' % release, 'r')
    pfamDict = pickle.load(inFile)
    inFile.close()

    ## Load the pdbDict.
    import pickle
    infile = open('data/pdbDict_%s.pkl' % release, 'r')
    pdbDict = pickle.load(infile)
    infile.close()

    ## Load the uniprotDict.
    import pickle
    infile = open('data/bsDictUniprot_%s.pkl' % release, 'r')
    uniprotDict = pickle.load(infile)
    infile.close()
    print 'number of targets with binding site information', len(
        uniprotDict.keys())

    ## Load the uniDict.
    import parseUniChem
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    import pickle
    infile = open('data/propDict_%s.pkl' % release, 'r')
    propDict = pickle.load(infile)
    infile.close()

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over non-domain regions.
    import getRatioUnstruct
    import writeTable
    import os
    pfamDict = getRatioUnstruct.getRatio(pfamDict, humProtCod, release, user,
                                         pword, host, port)
    writeTable.writePfam(pfamDict, humProtCod, humChembl, chemblTargets,
                         release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s plotPfamStat.R'
        % release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    import matchData
    import evaluatePred
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R'
        % ('within', "PDB", release))

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    import matchData
    import evaluatePred
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R'
        % ('within', "Uni", release))

    ## Print a summary of the number of targets and domains covered by the mapping.
    import groupSize
    import os
    allDomains = groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singles)
    print "all possible groups (single, none, multi, conflict):", groupsAll
    (single, multi, conflict) = groupSize.groupSizeMap(chemblTargets, release,
                                                       user, pword, host, port)
    print "all covered targets (single, multi, conflict): ", len(single), len(
        multi), len(conflict)
    (single, multi, conflict) = groupSize.actSizeMap(chemblTargets, release,
                                                     user, pword, host, port)
    print "all covered targets (single, multi, conflict): ", len(single), len(
        multi), len(conflict)

    ## Plot the evaluation of the mappings.
    import queryDevice
    import matchData
    import evaluatePred
    import os

    intacts = queryDevice.queryDevice(
        """SELECT mpf.protein_accession,
		mpf.domain,mpf.molregno, pfd.start, pfd.end, mpf.maptype,
	 	md.chembl_id FROM map_pfam mpf 
	JOIN pfam_domains pfd 
	  ON pfd.protein_accession = mpf.protein_accession 
	JOIN molecule_dictionary md 
	  ON md.molregno = mpf.molregno 
	WHERE mpf.domain = pfd.domain""", release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R'
        % ('prediction', 'PDB', release))
    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R'
        % ('prediction', "Uni", release))

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurences
    ##  Prepare the data for the power law plot.
    ##  1. Count the targets and compounds per domain using the propDict
    ##  2. Count a human genes per domain using the Pfam dictionary
    ##  3. Plot the power law distributions for all domains and overlay 25 most
    ##     frequent domains
    import countFreqs
    import plplot
    import plplotRaw
    import parse
    countFreqs.countLigs(humProtCod.keys(), chemblTargets, release, user,
                         pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']

    for filename in filenames:
        os.system(
            '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s statPowerLaw.R'
            % filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print len(freqs), minx, al, filename, type(freqs), type(freqs[1])
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    import export
    import os
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(selected, propDict, threshold, release, user, pword,
                       host, port)

    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    os.system(
        "/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s pca.R"
        % filename)