Beispiel #1
0
def main(filename):
    """Print consensus sequences for each mutation count k and over all k.

    Input layout, as consumed by the parser below:
      * Line 1 is the reference sequence.  Only its length n is used here
        (computeViennaSecStr is also called on it, but the result is not
        used by this function).
      * A line starting with '>' opens a new group.  Its second-to-last
        whitespace-separated field is read as the group's mutation count
        k.  While the current k is still 0 (normally: on the first
        header) the third field is read as the number of samples, and
        that value is used to normalise every group.
      * Any other line is a sampled sequence.  It is stripped, upper-cased
        and tallied per position; the line that follows it is read and
        discarded.

    Output goes to stdout: for each k in 1..K (K being the k of the last
    header read) a ">k" line and the per-position most frequent symbol,
    then the same consensus computed from the mean frequency over all k.
    Comparisons are strict, so among equally frequent symbols the one
    that comes first in NUCL wins, and a position whose frequencies are
    all zero contributes nothing to the output line.

    Args:
        filename: path of the sample file to read.
    """
    tallies = {}  # tallies[k][i][symbol] -> count, later frequency
    numMut = 0
    numSamples = 0
    # 'with' guarantees the handle is closed even if parsing raises.
    with open(filename) as handle:
        rna0 = handle.readline().strip()
        # NOTE(review): the result is unused; the call is kept in case the
        # helper validates rna0 or has side effects -- confirm and drop.
        computeViennaSecStr(rna0)
        n = len(rna0)
        line = handle.readline()
        while line:
            if line[0] == '>':
                words = line.split()
                if numMut == 0:  # first time, get number of samples
                    numSamples = int(words[2])
                numMut = int(words[-2])  # new number of mutations
                tallies[numMut] = {}
                for i in range(n):
                    tallies[numMut][i] = dict.fromkeys(NUCL, 0.0)
            else:
                rna = line.strip().upper()
                handle.readline()  # companion line of the record; not needed
                for i in range(n):
                    tallies[numMut][i][rna[i]] += 1.0
            line = handle.readline()

    print("Consensus sequences for k-mutants, each k")
    for k in range(1, numMut + 1):
        sys.stdout.write(">%d\n" % k)
        for i in range(n):
            freqs = tallies[k][i]
            consensusNucl = ''
            maxFreq = 0
            for ch in NUCL:
                freqs[ch] /= numSamples  # count -> frequency, in place
                if freqs[ch] > maxFreq:
                    maxFreq = freqs[ch]
                    consensusNucl = ch
            sys.stdout.write("%s" % consensusNucl)
        sys.stdout.write("\n")

    print("Consensus sequences over all mutants")
    print("> %d" % (numMut + 1))
    for i in range(n):
        # Mean, over all k, of the per-k frequencies at position i.
        meanFreq = dict.fromkeys(NUCL, 0.0)
        for k in range(1, numMut + 1):
            for ch in NUCL:
                meanFreq[ch] += tallies[k][i][ch]
        consensusNucl = ''
        maxFreq = 0
        for ch in NUCL:
            meanFreq[ch] = meanFreq[ch] / numMut  # normalize
            if meanFreq[ch] > maxFreq:
                maxFreq = meanFreq[ch]
                consensusNucl = ch
        sys.stdout.write("%s" % consensusNucl)
    sys.stdout.write("\n")
def _positionEntropies(pairCounts, k, n, total):
    """Return {i: entropy} for positions 1..n of mutation group k.

    pairCounts[k][i][j] holds how many samples had (i, j) reported as a
    base pair; dividing by total gives a frequency p(i, j).  What is left
    of 1.0 after subtracting every p(i, j), j != i, is used as p(i, i).
    The entropy of position i is the sum of -xLogX(p) over all of these
    values.  xLogX is defined elsewhere in the file (presumably
    x * log(x) -- confirm).
    """
    denom = float(total)
    entropies = {}
    for i in range(1, n + 1):  # indices 1<=i<=n
        sumEntropyForI = 0.0
        probII = 1.0  # compute p(i,i) by 1.0 - p(i,j) all j
        for j in range(1, n + 1):
            if i != j:
                probIJ = pairCounts[k][i][j] / denom
                sumEntropyForI += -xLogX(probIJ)
                probII -= probIJ
        entropies[i] = sumEntropyForI + -xLogX(probII)
    return entropies


def main(filename):
    """Print, per mutation count k, how mutation frequency tracks entropy.

    Input layout, as consumed by the parser below:
      * Line 1 is the reference sequence rna0 (length n).
        computeViennaSecStr is called on it, but its result is not used
        by this function.
      * A line starting with '>' opens a new group.  Its second-to-last
        whitespace-separated field is read as the group's mutation count
        k.  While the current k is still 0 (normally: on the first
        header) the third field is read as the number of samples.
      * Any other line is a sampled sequence; the line after it is passed
        to basePairList.  The result is only tested with
        "(i, j) in ..." for 1-based i < j, so it is assumed to be a
        container of such tuples -- confirm against basePairList.

    For each group two per-position vectors are built: the fraction of
    samples that differ from rna0 at the position, and the position's
    pairing entropy.  For every k in 1..K (K being the k of the last
    header) one tab-separated row is printed with corrCoeff of the two
    vectors and the mean and standard deviation of each, as returned by
    getSampleStats.

    Args:
        filename: path of the sample file to read.
    """
    mutFreq = {}     # mutFreq[k][i], 0<=i<n: mutations seen at position i
    pairCounts = {}  # pairCounts[k][i][j], 1<=i,j<=n: times i paired with j
    entropy = {}     # entropy[k][i], 1<=i<=n
    numMut = 0
    numSamples = 0
    num = 0          # samples read so far in the current group
    # 'with' guarantees the handle is closed even if parsing raises.
    with open(filename) as handle:
        rna0 = handle.readline().strip()
        # NOTE(review): the result is unused; the call is kept in case the
        # helper validates rna0 or has side effects -- confirm and drop.
        computeViennaSecStr(rna0)
        n = len(rna0)
        line = handle.readline()
        while line:
            if line[0] == '>':
                words = line.split()
                if numMut == 0:  # first time, get number of samples
                    numSamples = int(words[2])
                else:  # the previous group is complete: compute its entropy
                    entropy[numMut] = _positionEntropies(
                        pairCounts, numMut, n, numSamples)
                numMut = int(words[-2])  # new number of mutations
                # WARNING: mutFreq is indexed 0<=i<n, pairCounts 1<=i<=n
                mutFreq[numMut] = dict.fromkeys(range(n), 0.0)
                pairCounts[numMut] = {}
                for i in range(1, n + 1):
                    pairCounts[numMut][i] = dict.fromkeys(
                        range(1, n + 1), 0.0)
                num = 0  # start over counter of samples in the group
            else:
                num += 1
                if DEBUG:
                    print("%s %s" % (numMut, num))
                rna = line.strip().upper()
                bps = basePairList(handle.readline().strip())
                for i in range(n):
                    if rna0[i] != rna[i]:
                        mutFreq[numMut][i] += 1.0
                    baseI = i + 1  # 1-based position matching string index i
                    for j in range(1, n + 1):
                        # A pair is looked up as (smaller, larger).
                        if baseI != j and (min(baseI, j), max(baseI, j)) in bps:
                            pairCounts[numMut][baseI][j] += 1
            line = handle.readline()

    # The last group has no following header, so finish it here.
    # NOTE(review): this uses the number of samples actually read in the
    # last group, whereas earlier groups use numSamples from the first
    # header; the two agree only if every group is complete -- confirm
    # which denominator is intended.
    entropy[numMut] = _positionEntropies(pairCounts, numMut, n, num)

    # Normalize mutation counts to frequencies
    for k in range(1, numMut + 1):
        for i in range(n):
            mutFreq[k][i] /= numSamples

    # Now compute correlation coefficient
    print("k\tcorrCoeff\tmean1\t\tstdev1\t\tmean2\t\tstdev2")
    for k in range(1, numMut + 1):
        L1 = [mutFreq[k][i] for i in range(n)]
        L2 = [entropy[k][i + 1] for i in range(n)]  # entropy is 1-based
        mean1, stdev1, _max1, _min1 = getSampleStats(L1)
        mean2, stdev2, _max2, _min2 = getSampleStats(L2)
        print("%d\t%f\t%f\t%f\t%f\t%f" % (
            k, corrCoeff(L1, L2), mean1, stdev1, mean2, stdev2))
Beispiel #3
0
def main(filename):
    """Print consensus sequences for each mutation count k and over all k.

    Input layout, as consumed by the parser below:
      * Line 1 is the reference sequence.  Only its length n is used here
        (computeViennaSecStr is also called on it, but the result is not
        used by this function).
      * A line starting with '>' opens a new group.  Its second-to-last
        whitespace-separated field is read as the group's mutation count
        k.  While the current k is still 0 (normally: on the first
        header) the third field is read as the number of samples, and
        that value is used to normalise every group.
      * Any other line is a sampled sequence.  It is stripped, upper-cased
        and tallied per position; the line that follows it is read and
        discarded.

    Output goes to stdout: for each k in 1..K (K being the k of the last
    header read) a ">k" line and the per-position most frequent symbol,
    then the same consensus computed from the mean frequency over all k.
    Comparisons are strict, so among equally frequent symbols the one
    that comes first in NUCL wins, and a position whose frequencies are
    all zero contributes nothing to the output line.

    Args:
        filename: path of the sample file to read.
    """
    tallies = {}  # tallies[k][i][symbol] -> count, later frequency
    numMut = 0
    numSamples = 0
    # 'with' guarantees the handle is closed even if parsing raises.
    with open(filename) as handle:
        rna0 = handle.readline().strip()
        # NOTE(review): the result is unused; the call is kept in case the
        # helper validates rna0 or has side effects -- confirm and drop.
        computeViennaSecStr(rna0)
        n = len(rna0)
        line = handle.readline()
        while line:
            if line[0] == '>':
                words = line.split()
                if numMut == 0:  # first time, get number of samples
                    numSamples = int(words[2])
                numMut = int(words[-2])  # new number of mutations
                tallies[numMut] = {}
                for i in range(n):
                    tallies[numMut][i] = dict.fromkeys(NUCL, 0.0)
            else:
                rna = line.strip().upper()
                handle.readline()  # companion line of the record; not needed
                for i in range(n):
                    tallies[numMut][i][rna[i]] += 1.0
            line = handle.readline()

    print("Consensus sequences for k-mutants, each k")
    for k in range(1, numMut + 1):
        sys.stdout.write(">%d\n" % k)
        for i in range(n):
            freqs = tallies[k][i]
            consensusNucl = ''
            maxFreq = 0
            for ch in NUCL:
                freqs[ch] /= numSamples  # count -> frequency, in place
                if freqs[ch] > maxFreq:
                    maxFreq = freqs[ch]
                    consensusNucl = ch
            sys.stdout.write("%s" % consensusNucl)
        sys.stdout.write("\n")

    print("Consensus sequences over all mutants")
    print("> %d" % (numMut + 1))
    for i in range(n):
        # Mean, over all k, of the per-k frequencies at position i.
        meanFreq = dict.fromkeys(NUCL, 0.0)
        for k in range(1, numMut + 1):
            for ch in NUCL:
                meanFreq[ch] += tallies[k][i][ch]
        consensusNucl = ''
        maxFreq = 0
        for ch in NUCL:
            meanFreq[ch] = meanFreq[ch] / numMut  # normalize
            if meanFreq[ch] > maxFreq:
                maxFreq = meanFreq[ch]
                consensusNucl = ch
        sys.stdout.write("%s" % consensusNucl)
    sys.stdout.write("\n")
def _positionEntropies(pairCounts, k, n, total):
    """Return {i: entropy} for positions 1..n of mutation group k.

    pairCounts[k][i][j] holds how many samples had (i, j) reported as a
    base pair; dividing by total gives a frequency p(i, j).  What is left
    of 1.0 after subtracting every p(i, j), j != i, is used as p(i, i).
    The entropy of position i is the sum of -xLogX(p) over all of these
    values.  xLogX is defined elsewhere in the file (presumably
    x * log(x) -- confirm).
    """
    denom = float(total)
    entropies = {}
    for i in range(1, n + 1):  # indices 1<=i<=n
        sumEntropyForI = 0.0
        probII = 1.0  # compute p(i,i) by 1.0 - p(i,j) all j
        for j in range(1, n + 1):
            if i != j:
                probIJ = pairCounts[k][i][j] / denom
                sumEntropyForI += -xLogX(probIJ)
                probII -= probIJ
        entropies[i] = sumEntropyForI + -xLogX(probII)
    return entropies


def main(filename):
    """Print, per mutation count k, how mutation frequency tracks entropy.

    Input layout, as consumed by the parser below:
      * Line 1 is the reference sequence rna0 (length n).
        computeViennaSecStr is called on it, but its result is not used
        by this function.
      * A line starting with '>' opens a new group.  Its second-to-last
        whitespace-separated field is read as the group's mutation count
        k.  While the current k is still 0 (normally: on the first
        header) the third field is read as the number of samples.
      * Any other line is a sampled sequence; the line after it is passed
        to basePairList.  The result is only tested with
        "(i, j) in ..." for 1-based i < j, so it is assumed to be a
        container of such tuples -- confirm against basePairList.

    For each group two per-position vectors are built: the fraction of
    samples that differ from rna0 at the position, and the position's
    pairing entropy.  For every k in 1..K (K being the k of the last
    header) one tab-separated row is printed with corrCoeff of the two
    vectors and the mean and standard deviation of each, as returned by
    getSampleStats.

    Args:
        filename: path of the sample file to read.
    """
    mutFreq = {}     # mutFreq[k][i], 0<=i<n: mutations seen at position i
    pairCounts = {}  # pairCounts[k][i][j], 1<=i,j<=n: times i paired with j
    entropy = {}     # entropy[k][i], 1<=i<=n
    numMut = 0
    numSamples = 0
    num = 0          # samples read so far in the current group
    # 'with' guarantees the handle is closed even if parsing raises.
    with open(filename) as handle:
        rna0 = handle.readline().strip()
        # NOTE(review): the result is unused; the call is kept in case the
        # helper validates rna0 or has side effects -- confirm and drop.
        computeViennaSecStr(rna0)
        n = len(rna0)
        line = handle.readline()
        while line:
            if line[0] == '>':
                words = line.split()
                if numMut == 0:  # first time, get number of samples
                    numSamples = int(words[2])
                else:  # the previous group is complete: compute its entropy
                    entropy[numMut] = _positionEntropies(
                        pairCounts, numMut, n, numSamples)
                numMut = int(words[-2])  # new number of mutations
                # WARNING: mutFreq is indexed 0<=i<n, pairCounts 1<=i<=n
                mutFreq[numMut] = dict.fromkeys(range(n), 0.0)
                pairCounts[numMut] = {}
                for i in range(1, n + 1):
                    pairCounts[numMut][i] = dict.fromkeys(
                        range(1, n + 1), 0.0)
                num = 0  # start over counter of samples in the group
            else:
                num += 1
                if DEBUG:
                    print("%s %s" % (numMut, num))
                rna = line.strip().upper()
                bps = basePairList(handle.readline().strip())
                for i in range(n):
                    if rna0[i] != rna[i]:
                        mutFreq[numMut][i] += 1.0
                    baseI = i + 1  # 1-based position matching string index i
                    for j in range(1, n + 1):
                        # A pair is looked up as (smaller, larger).
                        if baseI != j and (min(baseI, j), max(baseI, j)) in bps:
                            pairCounts[numMut][baseI][j] += 1
            line = handle.readline()

    # The last group has no following header, so finish it here.
    # NOTE(review): this uses the number of samples actually read in the
    # last group, whereas earlier groups use numSamples from the first
    # header; the two agree only if every group is complete -- confirm
    # which denominator is intended.
    entropy[numMut] = _positionEntropies(pairCounts, numMut, n, num)

    # Normalize mutation counts to frequencies
    for k in range(1, numMut + 1):
        for i in range(n):
            mutFreq[k][i] /= numSamples

    # Now compute correlation coefficient
    print("k\tcorrCoeff\tmean1\t\tstdev1\t\tmean2\t\tstdev2")
    for k in range(1, numMut + 1):
        L1 = [mutFreq[k][i] for i in range(n)]
        L2 = [entropy[k][i + 1] for i in range(n)]  # entropy is 1-based
        mean1, stdev1, _max1, _min1 = getSampleStats(L1)
        mean2, stdev2, _max2, _min2 = getSampleStats(L2)
        print("%d\t%f\t%f\t%f\t%f\t%f" % (
            k, corrCoeff(L1, L2), mean1, stdev1, mean2, stdev2))