Ejemplo n.º 1
0
def power_law(x, variable, subject, radius = 0.5, number_of_sets = 100):
    """Fit a power law to the data ``x``, print a report and plot the result.

    The data are treated as continuous as soon as one value has a
    fractional part, and as discrete otherwise.  For discrete data the
    estimate returned by ``plfit0.plfit0`` is used to restrict the scaling
    parameters handed to ``plfit.plfit`` and ``plpva.plpva`` to a window
    around the rounded estimate.

    Parameters:
        x: empirical data; must support ``x - numpy.floor(x)``.
        variable: label of the data, used in the report and passed to the
            plot routine.
        subject: suffix of the plot name (``"plplot_" + subject``).
        radius: half-width of the window of scaling parameters scanned
            around the rounded first estimate (discrete data only).
        number_of_sets: value passed as ``reps`` to ``plpva.plpva``.

    Returns:
        None.  Results are printed and handed to ``plplot.plplot``.
    """
    print("- Fitting power law to empirical data: %s" % variable)
    # Any non-zero fractional part marks the data as continuous.
    if numpy.any(x - numpy.floor(x)):
        print("  CONTINUOUS")
        alpha_range = None
    else:
        alpha, xmin = plfit0.plfit0(x)
        print("  DISCRETE")
        print("  Approximate estimator for the scaling parameter of the discrete power law:")
        print("  * Scaling parameter: alpha %g" % alpha)
        print("  * Lower bound: xmin %g" % xmin)
        centre = round(alpha)
        alpha_range = numpy.arange(centre - radius, centre + radius, 0.001)
        # distributions with alpha <= 1 are not normalizable
        alpha_range = alpha_range[alpha_range > 1]
    alpha, xmin, L = plfit.plfit(x, vec = alpha_range, nosmall = False, finite = True)
    print("  Numerical maximization of the logarithm of the likelihood function L:")
    print("  * Scaling parameter: alpha %g" % alpha)
    # Warn when the optimum sits on the edge of the scanned window: the true
    # maximum may then lie outside it.  Checked explicitly rather than by
    # catching TypeError, so an empty window no longer raises ValueError.
    if alpha_range is not None and len(alpha_range) > 0:
        if alpha == alpha_range.min() or alpha == alpha_range.max():
            print("    WARNING alpha_range")
    print("  * Lower bound: xmin %g" % xmin)
    print("  * Logarithm of the likelihood function: L %g" % L)
    p, gof = plpva.plpva(x, xmin, vec=alpha_range, reps=number_of_sets, quiet=True)
    print("  Generation of %d power-law distributed synthetic data sets:" % number_of_sets)
    print("  * Fraction of data sets with worse KS statistic than the empirical data: p-value %g" % p)
    print("  * KS statistic of the empirical data: D %g" % gof)
    png = "plplot_" + subject
    plplot.plplot(x, xmin, alpha, variable, p, png)
Ejemplo n.º 2
0
def analysis(th, release, user, pword, host, port):
    """Run the whole analysis: load the data, evaluate the mappings, plot.

    The prepared dictionaries are read from ``data/``, handed to the
    project helper modules, and the R scripts that draw the figures are
    started through ``os.system``.

    Parameters:
        th: activity threshold; converted to ``-log10(th * 10**-6)`` before
            use (presumably a micromolar value turned into a pKi-like
            number -- confirm with the caller).
        release: release identifier; used in file names, passed to the
            helper modules and to the R scripts.
        user, pword, host, port: database connection settings forwarded
            unchanged to the helper modules.

    Returns:
        None.
    """
    import os
    import pickle

    import numpy as np

    import getUniprotTargets
    import parse
    import parseUniChem
    import getRatioUnstruct
    import writeTable
    import matchData
    import evaluatePred
    import groupSize
    import queryDevice
    import countFreqs
    import plplot
    import plplotRaw
    import export

    r_batch = ('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 '
               'CMD BATCH --vanilla ')

    def load_pickle(path):
        # Pickles are binary data, so the file is opened in 'rb'; the with
        # block closes it even when unpickling fails.
        # NOTE(review): pickle.load can execute arbitrary code -- only use
        # it on files this project wrote itself.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def run_r(arguments):
        # NOTE(review): the arguments are pasted unquoted into a shell
        # command line; they must not come from untrusted input.
        os.system(r_batch + arguments)

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    threshold = -np.log10(th*10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Read all human protein coding genes
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod))

    ## Get a list of all human (!) ChEMBL targets
    # Membership is tested on the dict itself instead of a fresh key list
    # per target.
    humChembl = dict(
        (target, 0) for target in chemblTargets if target in humProtCod)

    ## Load the pfamDict, pdbDict and uniprotDict.
    pfamDict = load_pickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = load_pickle('data/pdbDict_chembl%s.pkl' % release)
    uniprotDict = load_pickle('data/bsDictUniprot_chembl%s.pkl' % release)
    print('number of targets with binding site information %s'
          % len(uniprotDict))

    ## Load the uniDict.
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    propDict = load_pickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over
    ## non-domain regions.
    pfamDict = getRatioUnstruct.getRatio(
        pfamDict, humProtCod, release, user, pword, host, port)
    writeTable.writePfam(
        pfamDict, humProtCod, humChembl, chemblTargets, release)
    run_r('-%s plotPfamStat.R' % release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    run_r('-%s  -%s -%s  ecdf.R' % ('within', "PDB", release))

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    run_r('-%s -%s -%s  ecdf.R' % ('within', "Uni", release))

    ## Print a summary of the number of targets and domains covered by the
    ## mapping.
    # The result of uniqueDomains was never used; the call is kept in case
    # the helper has side effects.
    groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    # The original passed the undefined name `singles` here.
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print("all possible groups (single, none, multi, conflict): %s"
          % (groupsAll,))
    (single, multi, conflict) = groupSize.groupSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("all covered targets (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("all covered targets (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    query = (
        "SELECT mpf.protein_accession,mpf.domain,mpf.molregno, pfd.start, "
        "pfd.end, mpf.maptype, md.chembl_id "
        "FROM map_pfam mpf "
        "JOIN pfam_domains pfd "
        "ON pfd.protein_accession = mpf.protein_accession "
        "JOIN molecule_dictionary md ON md.molregno = mpf.molregno "
        "WHERE mpf.domain = pfd.domain"
    )
    intacts = queryDevice.queryDevice(query, release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    run_r('-%s -%s -%s  ecdf.R' % ('prediction', 'PDB', release))
    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    run_r('-%s  -%s -%s  ecdf.R' % ('prediction', "Uni", release))

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurences
    ##  Prepare the data for the power law plot.
    ##  1. Count the targets and compounds per domain using the propDict
    ##  2. Count a human genes per domain using the Pfam dictionary
    ##  3. Plot the power law distributions for all domains and overlay 25
    ##     most frequent domains
    countFreqs.countLigs(
        humProtCod.keys(), chemblTargets, release, user, pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)

    for filename in ('genFreq.tab', 'domLigs.tab', 'targLigs.tab'):
        run_r('-%s statPowerLaw.R' % filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print("%s %s %s %s %s %s" % (len(freqs), minx, al, filename,
                                     type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(
        selected, propDict, threshold, release, user, pword, host, port)

    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    run_r("-%s pca.R" % filename)
Ejemplo n.º 3
0
# Disabled code that builds `plothist` by hand from repeated values
# (36 x 50, 18 x 71, 12 x 100, 6 x 141, 4 x 200, 1 x 282, 2 x 400).
# NOTE(review): `plothist`, `zebins`, `plt` and `myp` are not defined in the
# visible part of this script; presumably they are set up earlier.
#plothist = []
#plothist = np.append(plothist,50. *np.ones(36))
#plothist = np.append(plothist,71. *np.ones(18) )
#plothist = np.append(plothist,100. *np.ones(12) )
#plothist = np.append(plothist,141. *np.ones(6) )
#plothist = np.append(plothist,200.*np.ones(4) )
#plothist = np.append(plothist,282.*np.ones(1)  )
#plothist = np.append(plothist,400.*np.ones(2)  )

# NOTE(review): exit() ends the script here, so none of the plotting code
# below runs as the file stands.
exit()

# Figure 3: power-law fit of `plothist` drawn over its histogram.
plt.figure(3)
# NOTE(review): the 'xmin', 30 arguments presumably pin the lower cutoff of
# the fit to 30 (50. was an earlier choice) -- confirm against plfit.
[alpha, xmin, L] = plfit.plfit(plothist,'xmin',30)#50.)
print alpha,xmin
#a = plpva.plpva(plothist,30,'xmin',30)
h = plplot.plplot(plothist,xmin,alpha)
# h[0] against h[1] is drawn as a black dashed line on log-log axes
# (presumably the x/y values of the fitted curve -- TODO confirm).
plt.loglog(h[0], h[1], 'k--',linewidth=2)
# Histogram of the data with a logarithmic count axis, using the bin edges
# in `zebins`.
# NOTE(review): `normed` was removed from matplotlib's hist in favour of
# `density`; this call only works with old matplotlib releases.
plt.hist(    plothist,\
             log=True,\
             bins=zebins,\
#             cumulative=-1,\
             normed=True,\
        )
plt.xscale('log')
plt.xlabel('Pressure (micro Pa)')
plt.ylabel('Population (normalized)')
# NOTE(review): presumably writes the current figure to a file named after
# "data" at resolution 200 without displaying it -- verify in `myp`.
myp.makeplotres("data",res=200,disp=False)

# Disabled variant of the fit for a fourth figure.
#plt.figure(4)
#[alpha, xmin, L] = plfit.plfit(plothist,'xmin',50.) #,'xmin',30.)
#print alpha,xmin
Ejemplo n.º 4
0
def analysis(release):
    """Run the whole analysis for one release: load data, evaluate, plot.

    Connection settings and the activity threshold are read from
    ``mpf.yaml`` (keys ``user``, ``pword``, ``host``, ``port`` and
    ``threshold``).  The prepared dictionaries are read from ``data/``,
    handed to the project helper modules, and the R scripts that draw the
    figures are started through ``os.system``.

    Parameters:
        release: release identifier; used in file names, passed to the
            helper modules and to the R scripts.

    Returns:
        None.
    """
    import os
    import pickle

    import numpy as np
    import yaml

    import getUniprotTargets
    import parse
    import parseUniChem
    import getRatioUnstruct
    import writeTable
    import matchData
    import evaluatePred
    import groupSize
    import queryDevice
    import countFreqs
    import plplot
    import plplotRaw
    import export

    r_batch = ('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 '
               'CMD BATCH --vanilla ')

    def load_pickle(path):
        # Pickles are binary data, so the file is opened in 'rb'; the with
        # block closes it even when unpickling fails.
        # NOTE(review): pickle.load can execute arbitrary code -- only use
        # it on files this project wrote itself.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def run_r(arguments):
        # NOTE(review): the arguments are pasted unquoted into a shell
        # command line; they must not come from untrusted input.
        os.system(r_batch + arguments)

    ####
    #### Load parameters.
    ####

    # Read config file; the with block closes it, which the original
    # never did.
    with open('mpf.yaml') as paramFile:
        params = yaml.safe_load(paramFile)
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    th = params['threshold']

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    threshold = -np.log10(th * 10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get a list of all human (!) ChEMBL targets
    # The organism name had been mangled to 'H**o sapiens', which can never
    # match and left humChembl empty.
    humChembl = dict(
        (target, 0) for target, organism in chemblTargets.items()
        if organism == 'Homo sapiens')

    ## Read all human protein coding genes
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod))

    ## Load the pfamDict, pdbDict and uniprotDict.
    pfamDict = load_pickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = load_pickle('data/pdbDict_%s.pkl' % release)
    uniprotDict = load_pickle('data/bsDictUniprot_%s.pkl' % release)
    print('number of targets with binding site information %s'
          % len(uniprotDict))

    ## Load the uniDict.
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    propDict = load_pickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over
    ## non-domain regions.
    pfamDict = getRatioUnstruct.getRatio(
        pfamDict, humProtCod, release, user, pword, host, port)
    writeTable.writePfam(
        pfamDict, humProtCod, humChembl, chemblTargets, release)
    run_r('-%s plotPfamStat.R' % release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    run_r('-%s  -%s -%s  ecdf.R' % ('within', "PDB", release))

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    run_r('-%s -%s -%s  ecdf.R' % ('within', "Uni", release))

    ## Print a summary of the number of targets and domains covered by the
    ## mapping.
    # The result of uniqueDomains was never used; the call is kept in case
    # the helper has side effects.
    groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    # The original passed the undefined name `singles` here.
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print("all possible groups (single, none, multi, conflict): %s"
          % (groupsAll,))
    (single, multi, conflict) = groupSize.groupSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("all covered targets (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("all covered targets (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    # The query text, including its line breaks and indentation, is kept
    # exactly as it was; escapes make the whitespace visible.
    query = (
        "SELECT mpf.protein_accession,\n"
        "\t\tmpf.domain,mpf.molregno, pfd.start, pfd.end, mpf.maptype,\n"
        "\t \tmd.chembl_id FROM map_pfam mpf \n"
        "\tJOIN pfam_domains pfd \n"
        "\t  ON pfd.protein_accession = mpf.protein_accession \n"
        "\tJOIN molecule_dictionary md \n"
        "\t  ON md.molregno = mpf.molregno \n"
        "\tWHERE mpf.domain = pfd.domain"
    )
    intacts = queryDevice.queryDevice(query, release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    run_r('-%s -%s -%s  ecdf.R' % ('prediction', 'PDB', release))
    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    run_r('-%s  -%s -%s  ecdf.R' % ('prediction', "Uni", release))

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurences
    ##  Prepare the data for the power law plot.
    ##  1. Count the targets and compounds per domain using the propDict
    ##  2. Count a human genes per domain using the Pfam dictionary
    ##  3. Plot the power law distributions for all domains and overlay 25
    ##     most frequent domains
    countFreqs.countLigs(
        humProtCod.keys(), chemblTargets, release, user, pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)

    for filename in ('genFreq.tab', 'domLigs.tab', 'targLigs.tab'):
        run_r('-%s statPowerLaw.R' % filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print("%s %s %s %s %s %s" % (len(freqs), minx, al, filename,
                                     type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(
        selected, propDict, threshold, release, user, pword, host, port)

    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    run_r("-%s pca.R" % filename)