Ejemplo n.º 1
0
def summarizeOneFeature(tmDataList, columnName, intervals=50, outName="a.txt"):
  '''Writes a tab-separated histogram table of one column, one row per structure.

  The column is looked up by title on the first entry of tmDataList and read
  from every entry with getListColumn(). The largest value seen across all
  entries fixes the histogram range, so every row shares the same bins.

  Output layout (each field is followed by a tab):
    header: name, count, mean, stddev, then the lower edge of each bin
    rows:   inputFileName, number of values, mean, stddev, then the
            counts returned by statistics.computeHistogram

  Args:
    tmDataList: sequence of objects providing titleToColumn(),
      getListColumn() and an inputFileName string. An empty sequence
      produces a file holding only the fixed header fields.
    columnName: title of the column to summarize.
    intervals: number of bins wanted, or the string "max" for bins of
      width 1 (the bin count is then the overall maximum).
    outName: path of the file to write (overwritten).

  Raises:
    ValueError: if intervals is negative (the header loop could never
      finish), or from max() if an entry's column is empty.
    ZeroDivisionError: if intervals is 0 while the data maximum is positive.
  '''
  # Gather every column before touching the output file, so that a failure
  # while reading the inputs does not leave a truncated report behind.
  treeData = []
  overallMax = 0.
  if tmDataList:
    columnNum = tmDataList[0].titleToColumn(columnName)
    for tm3tree in tmDataList:
      data = tm3tree.getListColumn(columnNum)
      overallMax = max(overallMax, max(data))
      treeData.append((tm3tree, data))
  if intervals == "max":
    intervals = overallMax  # 1 per
  if overallMax > 0:
    # float() matters on Python 2: max() returns an int when the data are
    # ints, and int/int would truncate the bin width to 0.
    interval = float(overallMax) / intervals
    if interval <= 0:
      raise ValueError("intervals must be positive, got " + repr(intervals))
  else:
    # Nothing above zero to bin; also avoids 0/0 when intervals == "max".
    interval = 0.
  with open(outName, 'w') as outFile:
    header = ["name", "count", "mean", "stddev"]
    currentOut = 0.
    while currentOut < overallMax:
      header.append(str(currentOut))
      currentOut += interval
    outFile.write("\t".join(header) + "\t\n")
    for tm3tree, tm3data in treeData:
      avgData = statistics.computeMean(tm3data)
      stddevData = statistics.computeStdDev(tm3data, avgData)
      # The second return value is not needed for the report.
      histo, _unusedMax = statistics.computeHistogram(
          tm3data, interval, overallMax)
      row = [tm3tree.inputFileName, str(len(tm3data)), str(avgData),
             str(stddevData)]
      row.extend(str(histoCount) for histoCount in histo)
      outFile.write("\t".join(row) + "\t\n")
Ejemplo n.º 2
0
def calcColumnsMeanStddev(columnList, tmDataList):
    """Pool each column over all data sets and compute its mean and stddev.

    For every column in columnList, the values returned by getListColumn()
    on each entry of tmDataList are concatenated (in tmDataList order) and
    passed to statistics.computeMean / statistics.computeStdDev.

    Returns a pair of dicts, both keyed by column: (means, stddevs).
    """
    means, stddevs = {}, {}
    for col in columnList:
        pooled = [
            value
            for tmData in tmDataList
            for value in tmData.getListColumn(col)
        ]
        mean = statistics.computeMean(pooled)
        means[col] = mean
        stddevs[col] = statistics.computeStdDev(pooled, mean)
    return means, stddevs
Ejemplo n.º 3
0
def makeResidueReport(
    residueData, outputFilename="residue.bfactor",
    maxY=False, maxYBeta=False, runGraphs=False):
  '''Writes per-residue statistics to a text file and optionally plots them.

  For every code in aminoAcid3Codes (in sorted order) all per-atom value
  lists found under residueData[code] are pooled, and the mean, stddev,
  minimum, maximum and count are written to outputFilename + ".txt".
  Residues without data get 0.0 for low, high and count. The values stored
  under the residue's carbonBetaCodes entry are summarized separately and
  only used for the second plot.

  Args:
    residueData: mapping of residue name -> mapping of atom name -> list of
      values.
    outputFilename: base path; ".txt", ".png" and ".beta.png" are appended.
    maxY: False to auto-scale the first plot, otherwise the upper y limit
      (the lower limit is then 0).
    maxYBeta: same as maxY, for the carbon-beta plot.
    runGraphs: plots are only produced when this is true and
      gnuplotAvailable is set.
  '''
  # sorted() makes a copy. The previous code sorted the module-level
  # aminoAcid3Codes list in place, silently reordering it for every caller.
  residueNames = sorted(aminoAcid3Codes)
  averages, stddevs = {}, {}
  betaAverages, betaStddevs = {}, {}
  with open(outputFilename + ".txt", 'w') as fileTemp:
    fileTemp.write("ResidueName Mean StdDev Low High Count\n")
    for residueName in residueNames:
      #assemble into one big list
      totalList = []
      if residueName in residueData:
        for data in residueData[residueName].values():
          totalList.extend(data)
      average = statistics.computeMean(totalList)
      averages[residueName] = average
      stddev = statistics.computeStdDev(totalList, average)
      stddevs[residueName] = stddev
      betaList = []
      if residueName in residueData:
        atomData = residueData[residueName]
        betaList.extend(atomData[carbonBetaCodes[residueName]])
      if betaList:
        betaAvg = statistics.computeMean(betaList)
        betaAverages[residueName] = betaAvg
        betaStddevs[residueName] = statistics.computeStdDev(betaList, betaAvg)
      if totalList:
        low, high, count = min(totalList), max(totalList), len(totalList)
      else:
        # keep the historical float formatting ("0.0") for empty residues
        low, high, count = 0., 0., 0.
      fileTemp.write(" ".join([
          residueName, str(average), str(stddev),
          str(low), str(high), str(count)]) + "\n")
  if runGraphs and gnuplotAvailable:
    plotter = Gnuplot.Gnuplot(debug=0)
    tics = []
    yData, yError, yMin, yMax = [], [], 10, 0
    yBetaData, yBetaError, yBetaMin, yBetaMax = [], [], 10, 0
    for index, code in enumerate(residueNames):
      tics.append('"' + str(code) + '" ' + str(index))
      if code in averages:
        yData.append(averages[code])
        yError.append(stddevs[code])
        yMin = min(yMin, yData[-1]-yError[-1])
        yMax = max(yMax, yData[-1]+yError[-1])
      else:  # none of that residue
        yData.append(0)
        yError.append(0)
      # Beta statistics only exist for residues that had carbon-beta values,
      # so they need their own membership test; reading them whenever the
      # overall average existed raised KeyError for absent residues.
      if code in betaAverages:
        yBetaData.append(betaAverages[code])
        yBetaError.append(betaStddevs[code])
        yBetaMin = min(yBetaMin, yBetaData[-1]-yBetaError[-1])
        yBetaMax = max(yBetaMax, yBetaData[-1]+yBetaError[-1])
      else:
        yBetaData.append(0)
        yBetaError.append(0)
    yLabels = '(' + ', '.join(tics) + ')'
    # One x position per residue instead of a hard-coded 20.
    xValues = list(range(len(residueNames)))
    graphData = Gnuplot.Data(xValues, yData, yError)
    plotter('set terminal png')
    plotter('set output "' + outputFilename + '.png"')
    plotter('set data style yerrorbars')
    plotter('set boxwidth 0.9 absolute')
    plotter('set xtics ' + yLabels)
    if maxY is False:
      plotter('set yrange [' + str(yMin-0.2) + ':' + str(yMax+0.2) + ']')
    else:
      plotter('set yrange [0:' + str(maxY) + ']')
    plotter('set xrange [-1:' + str(len(residueNames)) + ']')
    plotter.xlabel('Residue')
    plotter.ylabel('Mean Travel In Distance')
    plotter.plot(graphData)
    #do another graph with just carbon-betas
    plotter('set output "' + outputFilename + '.beta.png"')
    graphDataBeta = Gnuplot.Data(xValues, yBetaData, yBetaError)
    plotter.ylabel('Mean Travel In Distance of Carbon Beta')
    if maxYBeta is False:
      plotter(
          'set yrange [' + str(yBetaMin-0.2) + ':' + str(yBetaMax+0.2) + ']')
    else:
      plotter('set yrange [0:' + str(maxYBeta) + ']')
    plotter.plot(graphDataBeta)
Ejemplo n.º 4
0
def makeAtomReport(residueData, outputFilename="atom.bfactor", runGraphs=True):
  residueNames = residueData.keys()
  residueNames.sort()
  fileTemp = open(outputFilename + '.txt', 'w')
  fileTemp.write("ResidueName AtomName Mean StdDev Low High Count\n")
  resAtomAverage = {}
  for residueName in residueNames:
    resAtomAverage[residueName] = {}
    atomNames = residueData[residueName].keys()
    atomNames.sort()
    for atomName in atomNames:
      totalList = residueData[residueName][atomName]
      average = statistics.computeMean(totalList)
      resAtomAverage[residueName][atomName] = average
      stddev = statistics.computeStdDev(totalList, average)
      fileTemp.write(
          residueName + " " + atomName + " " + str(average) + " " +
          str(stddev) + " " + str(min(totalList)) + " " +
          str(max(totalList)) + " " + str(len(totalList)) + "\n")
  fileTemp.close()
  if gnuplotAvailable and runGraphs:
    #first make backbone-sidechain report
    plotter = Gnuplot.Gnuplot(debug=0)
    yLabels = '('
    yDataBackbone, yDataSidechain = [], []
    yDataCa, yDataCb = [], []
    for index, code in enumerate(aminoAcid3Codes):
      yLabels += '"' + str(code) + '" ' + str(index)
      if index != len(aminoAcid3Codes) - 1:
        yLabels += ', '
      backValues, sideValues = [], []
      caValues, cbValues = [], []
      try:
        for key, values in residueData[code].iteritems():
          if string.strip(key) in backboneAtomCodes:
            backValues.extend(values)
          else:
            sideValues.extend(values)
          if string.strip(key) == caCode:
            caValues.extend(values)
          elif string.strip(key) == cbCode:
            cbValues.extend(values)
      except KeyError:  # sometimes one residue won't be represented
        pass  # but that is okay
      if len(backValues) == 0:
        yDataBackbone.append(0)
      else:
        yDataBackbone.append(sum(backValues)/float(len(backValues)))
      if len(sideValues) == 0:
        yDataSidechain.append(0)
      else:
        yDataSidechain.append(sum(sideValues)/float(len(sideValues)))
      if len(caValues) == 0:
        yDataCa.append(0)
      else:
        yDataCa.append(sum(caValues)/float(len(caValues)))
      if len(cbValues) == 0:
        yDataCb.append(0)
      else:
        yDataCb.append(sum(cbValues)/float(len(cbValues)))
      if len(backValues + sideValues + caValues + cbValues) > 0:
        makeAminoAcidHistogram(
            plotter, backValues, sideValues, caValues, cbValues,
            outputFilename + "." + str(code))
    yLabels += ')'
    graphDataBackbone = Gnuplot.Data(range(20), yDataBackbone, title="Backbone")
    graphDataSidechain = Gnuplot.Data(
        range(20), yDataSidechain, title="Sidechain")
    graphDataCa = Gnuplot.Data(range(20), yDataCa, title="C-alpha")
    graphDataCb = Gnuplot.Data(range(20), yDataCb, title="C-beta")
    plotter('set terminal png')
    plotter('set output "' + outputFilename + '.png"')
    plotter('set data style points')
    plotter('set key right top')
    plotter('set xtics ' + yLabels)
    plotter(
        'set yrange [' + str(min(yDataBackbone + yDataSidechain) - 0.5) +
        ':' + str(max(yDataBackbone+yDataSidechain)+0.5) + ']')
    plotter('set xrange [-1:20]')
    plotter.xlabel('Residue')
    plotter.ylabel('Mean Travel In Distance')
    plotter.plot(graphDataBackbone, graphDataSidechain)
    plotter('set output "' + outputFilename + '.ab.png"')
    if "buried" in outputFilename:
      plotter('set yrange [' + str(min(yDataCa + yDataCb) - 0.5) + ':6.]')
    else:
      plotter(
          'set yrange [' + str(min(yDataCa + yDataCb) - 0.5) + ':' +
          str(max(yDataCa + yDataCb) + 0.5) + ']')
    plotter.plot(graphDataCa, graphDataCb)
Ejemplo n.º 5
0
def _writePvalueRow(outFile, code, averagesPair, listsPair, numTests):
  '''Writes one "code diff meanA meanB pAbove pBelow" line to outFile.

  Nothing is written when either data set has no entry for code.
  '''
  try:
    meanA, meanB = averagesPair[0][code], averagesPair[1][code]
    listA, listB = listsPair[0][code], listsPair[1][code]
  except KeyError:
    return  # residue was skipped for at least one data set
  diffMeans = meanA - meanB
  pvals = statistics.pvalueDiffMeans(listA, listB, diffMeans, numTests)
  outFile.write(" ".join([
      code, str(diffMeans), str(meanA), str(meanB),
      str(pvals[0]), str(pvals[1])]) + "\n")


def _meanDifference(averagesPair, code):
  '''Returns averagesPair[0][code] - averagesPair[1][code], or 0. if missing.

  A missing code is printed so the gap in the plot can be traced.
  '''
  try:
    return averagesPair[0][code] - averagesPair[1][code]
  except KeyError:
    print(code)
    return 0.


def makeCompareResidueReport(
    residueBoth, outputFilename="residue.bfactor", maxY=False, maxYBeta=False,
    numTests=9):
  '''Compares per-residue statistics of two data sets; writes reports, plots.

  Files written (outputFilename is the common prefix):
    .txt             one line per residue and data set: mean, stddev,
                     minimum, maximum and count of all pooled atom values
    .pvals.txt       per code in aminoAcid3Codes: difference of the two
                     means, both means and the two values returned by
                     statistics.pvalueDiffMeans
    .pvals.beta.txt  the same, restricted to the carbon-beta values
    .png, .beta.png  plots of the mean differences (only if gnuplotAvailable)

  A residue whose processing raises KeyError or ZeroDivisionError for one
  data set is skipped for that set; codes missing from either set are left
  out of the p-value files and plotted as a difference of 0.

  Args:
    residueBoth: pair of mappings, each residue name -> mapping of atom
      name -> list of values.
    outputFilename: prefix for every generated file.
    maxY, maxYBeta: intended y-axis limits; see the NOTE(review) below.
    numTests: passed through to statistics.pvalueDiffMeans.
  '''
  # NOTE(review): this fixed range is always truthy, so the maxY / maxYBeta
  # branches below never run. Kept as-is to preserve the existing plots.
  ranges = [-0.3, 0.6]
  # Union of both key sets; works on Python 2 lists and Python 3 key views.
  residueNames = sorted(
      set(residueBoth[0].keys()) | set(residueBoth[1].keys()))
  averages, stddevs = ({}, {}), ({}, {})
  betaAverages, betaStddevs = ({}, {}), ({}, {})
  totalLists, betaLists = ({}, {}), ({}, {})
  with open(outputFilename + ".txt", 'w') as fileTemp:
    # The rows carry no atom column, so the header no longer lists AtomName.
    fileTemp.write("ResidueName Mean StdDev Low High Count\n")
    for residueName in residueNames:
      for indexSet, residueData in enumerate(residueBoth):
        totalList, betaList = [], []
        try:
          atomData = residueData[residueName]
          for data in atomData.values():
            totalList.extend(data)
          totalLists[indexSet][residueName] = totalList
          average = statistics.computeMean(totalList)
          averages[indexSet][residueName] = average
          stddev = statistics.computeStdDev(totalList, average)
          stddevs[indexSet][residueName] = stddev
          betaList.extend(atomData[carbonBetaCodes[residueName]])
          betaLists[indexSet][residueName] = betaList
          if betaList:
            betaAvg = statistics.computeMean(betaList)
            betaAverages[indexSet][residueName] = betaAvg
            betaStddevs[indexSet][residueName] = statistics.computeStdDev(
                betaList, betaAvg)
          # Low/High/Count describe this data set's own list. They used to
          # be taken from the pair of lists, so Count was always 2.
          if totalList:
            low, high = min(totalList), max(totalList)
          else:
            low, high = 0., 0.
          fileTemp.write(" ".join([
              residueName, str(average), str(stddev),
              str(low), str(high), str(len(totalList))]) + "\n")
        except (ZeroDivisionError, KeyError):
          pass  # probably don't really need this residue anyway
  with open(outputFilename + ".pvals.txt", 'w') as fileTemp2:
    with open(outputFilename + ".pvals.beta.txt", 'w') as fileTemp3:
      header = "ResidueName DiffMeans MeanA MeanB pValAbove pValBelow\n"
      fileTemp2.write(header)
      fileTemp3.write(header)
      for code in aminoAcid3Codes:  # now do the pvalue tests
        _writePvalueRow(fileTemp2, code, averages, totalLists, numTests)
        _writePvalueRow(fileTemp3, code, betaAverages, betaLists, numTests)
  if gnuplotAvailable:
    plotter = Gnuplot.Gnuplot(debug=0)
    tics = []
    yData, yMin, yMax = [], 10, 0
    yBetaData, yBetaMin, yBetaMax = [], 10, 0
    for index, code in enumerate(aminoAcid3Codes):
      tics.append('"' + str(code) + '" ' + str(index))
      yData.append(_meanDifference(averages, code))
      yMin = min(yMin, yData[-1])
      yMax = max(yMax, yData[-1])
      yBetaData.append(_meanDifference(betaAverages, code))
      yBetaMin = min(yBetaMin, yBetaData[-1])
      yBetaMax = max(yBetaMax, yBetaData[-1])
    yLabels = '(' + ', '.join(tics) + ')'
    # One x position per residue code instead of a hard-coded 20.
    xValues = list(range(len(aminoAcid3Codes)))
    graphData = Gnuplot.Data(xValues, yData)
    plotter('set terminal png')
    plotter('set output "' + outputFilename + '.png"')
    plotter('set data style points')
    plotter('set boxwidth 0.9 absolute')
    plotter('set xtics ' + yLabels)
    if ranges:
      plotter('set yrange [' + str(ranges[0]) + ':' + str(ranges[1]) + ']')
    elif maxY is False:
      plotter('set yrange [' + str(yMin-0.2) + ':' + str(yMax+0.2) + ']')
    else:
      plotter('set yrange [0:' + str(maxY) + ']')
    plotter('set xrange [-1:' + str(len(aminoAcid3Codes)) + ']')
    plotter.xlabel('Residue')
    plotter.ylabel('Mean Travel In Distance')
    plotter.plot(graphData)
    #do another graph with just carbon-betas
    plotter('set output "' + outputFilename + '.beta.png"')
    graphDataBeta = Gnuplot.Data(xValues, yBetaData)
    plotter.ylabel('Mean Travel In Distance of Carbon Beta')
    if ranges:
      plotter('set yrange [' + str(ranges[0]) + ':' + str(ranges[1]) + ']')
    elif maxYBeta is False:
      plotter(
          'set yrange [' + str(yBetaMin-0.2) + ':' + str(yBetaMax+0.2) + ']')
    else:
      plotter('set yrange [0:' + str(maxYBeta) + ']')
    plotter.plot(graphDataBeta)