# operator and numpy are needed below; DefDict and calculateWeightedKendall are
# assumed to come from the project's utility module (DefDict behaves like
# collections.defaultdict).
import operator
import numpy as np


def distanceMatrixCorrelation(matrix1, matrix2, weights = None, collectComponents = False):
    """
    :param matrix1:
    :param matrix2:
    :param weights:
    :return: mean and STD of the Kendall Tau distances between all rows, and a
             sorted list of names in the order of better correlations
    """
    size = matrix1.getSize()
    assert(size == matrix2.getSize())
    assert((not weights) or (size == weights.getSize()))

    kendallList = [None] * size
    weightsAllOnes = [1.0] * size

    compDict = DefDict(list)
    compSet = set()
    if collectComponents:
        # Collect every component value that occurs anywhere in matrix1
        for vl in matrix1.getArray():
            for v in vl:
                compSet.add(v)

    for i in range(size):
        components = DefDict(float)
        kendallList[i] = calculateWeightedKendall(matrix1[i], matrix2[i],
                                                  weights = weights[i] if weights else None,
                                                  components = components if collectComponents else None)
        for k in compSet:
            compDict[k].append(components[k])

    sortedNames = sorted(zip(matrix1.names, kendallList), key = operator.itemgetter(1))

    compList = None
    if collectComponents:
        # Average the per-row component contributions, ordered by component name
        compList = map(np.mean,
                       map(operator.itemgetter(1),
                           sorted(compDict.iteritems(), key = operator.itemgetter(0))))

    return (np.mean(kendallList), np.std(kendallList), sortedNames, compList)
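# --- Usage sketch (added for illustration; not part of the original code) ---
# A minimal sketch of how distanceMatrixCorrelation might be called.  The
# _RowMatrix stub below is hypothetical: it only provides the interface the
# function assumes (getSize(), getArray(), row indexing and a .names
# attribute); the project's real matrix class and calculateWeightedKendall
# must still be available for this to run.
class _RowMatrix(object):
    def __init__(self, names, rows):
        self.names = names
        self.rows = rows

    def getSize(self):
        return len(self.rows)

    def getArray(self):
        return self.rows

    def __getitem__(self, i):
        return self.rows[i]


def _exampleDistanceMatrixCorrelation():
    m1 = _RowMatrix(["genomeA", "genomeB", "genomeC"],
                    [[0.0, 0.4, 0.9], [0.4, 0.0, 0.7], [0.9, 0.7, 0.0]])
    m2 = _RowMatrix(m1.names,
                    [[0.0, 0.5, 0.8], [0.5, 0.0, 0.6], [0.8, 0.6, 0.0]])
    meanTau, stdTau, ranked, _ = distanceMatrixCorrelation(m1, m2)
    print "mean tau %.3f +/- %.3f; row with the lowest tau: %s" % (meanTau, stdTau, ranked[0][0])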
def voteRank(sequences, motifs):
    poll = {}
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])

    # Perform the poll: every reported motif occurrence casts one vote per
    # covered position of its sequence
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in xrange(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we
                            # could bias based on tool or number of results or
                            # something like that
                            poll[sequence][i - 1] += 1
                        except Exception as e:
                            print e
                            print 'It appears a tool has reported finding a motif',\
                                  'outside the bounds of a sequence'
                            print 'such as finding a motif of length 10 at position',\
                                  '195 in a sequence with length 200'
                            pdb.set_trace()

    # Add up the votes covered by each motif occurrence
    ress = DD(int)
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                for pos in motifs[tool][motif][seq]:
                    for p in xrange(pos, pos + len(motif)):
                        ress[motif] += poll[best(sequences, seq)][p - 1]

    # Sort motifs by number of votes (ascending)
    return sorted(map(lambda a: list(a[::-1]), ress.iteritems()))
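# --- Usage sketch (added for illustration; not part of the original code) ---
# Toy call showing the expected shapes: "sequences" maps sequence name ->
# sequence string, and "motifs" maps tool -> motif -> sequence name -> list of
# 1-based start positions.  The best() helper defined elsewhere in this module
# is assumed to resolve a reported name back to a key of "sequences".
def _exampleVoteRank():
    sequences = {"seq1": "ACGTACGTAC", "seq2": "TTTACGTTTT"}
    motifs = {"MEME":   {"ACGT": {"seq1": [1, 5], "seq2": [4]}},
              "Weeder": {"ACGT": {"seq1": [5]}}}
    ranked = voteRank(sequences, motifs)
    # Each entry is [votes, motif]; the highest-voted motif sorts last
    print "top motif:", ranked[-1][1], "with", ranked[-1][0], "votes"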
OUT1.write("%s\t%0.1f\t%0.1f\t%s\t%s\n" % (transpos, RefMet[transpos], met, "nada", "nada")) # OUT1.write("%s\t%s\t%0.1f\t%s\t%s\t%s\n" % (transpos, RefMet[transpos], met, x[4], x[5], RefSeq[transpos]) ) FoundMet[transpos] = met found += 1 else: # Option for dumping all other 5mC calls . . . . . . met = 100 * (x[1] / x[2]) # headers = "Pos\tExpMet\tObsMet\tMETscore\tUMTscore\tSeq\n" OUT2.write("%s\t%0.1f\t%0.1f\t%s\t%s\tnada\n" % (transpos, RefMet[transpos], met, "nada", "nada")) IN.close() OUT1.close() OUT2.close() OUT3 = open(results + OUTlost, 'w') OUT3.write("POS\tpMET\tLOST\n") lost = 0 for (pos, count) in RefMet.iteritems(): if count > 0: if FoundMet[pos] < 0: OUT3.write("%s\t%s\t0\n" % (pos, RefMet[pos])) lost += 1 OUT3.close() print " FOUND = ", found, " ; LOST = ", lost print "\n\n\n * * * * * D O N E * * * * * * \n\n\n" # EOF ------------------------------------------------------------------------
cogDist = DefDict(dict)
for ordinal, (dir1, cs1) in enumerate(cogDict.iteritems(), start = 1):
    print("\r%d. %s" % (ordinal, dir1)),
    for dir2, cs2 in cogDict.iteritems():
        cogDist[dir1][dir2] = cogDistFunc(cs1, cs2)

print("\nBuilding average distances for TaxaTypes...")

# Genome dir -> dict of {taxaType -> COG distances from dir to genomes of that taxaType}
dirTaxaTypeDictDict = DefDict(lambda: DefDict(list))
for ordinal, dir1 in enumerate(taxaDict.keys(), start = 1):
    print("\r%d. %s" % (ordinal, dir1)),
    for dir2, taxa in taxaDict.iteritems():
        dirTaxaTypeDictDict[dir1][repr(taxa.type)].append(cogDist[dir1][dir2])

print("\nRebuilding dirTaxaTypeDictDict to get UtilNormDistribs...")
for dir, d in dirTaxaTypeDictDict.iteritems():
    # Find the global weighted STD over all taxa types for this dir
    std = 0.
    totalLen = 0
    for taxaTypeStr, distList in d.iteritems():
        if len(distList) >= 2:
            val = np.std(distList, ddof = 1)
            std += val * val * len(distList)
            totalLen += len(distList)
    if totalLen == 0:
        raise ValueError("Cannot calculate global std for %s" % dir)
    std /= totalLen

    for taxaTypeStr, distList in d.iteritems():
        localStd = np.std(distList)
        localStd *= localStd
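# --- Illustration only (added; not part of the original code) ---------------
# cogDistFunc is defined elsewhere in the project and its real form is not
# shown in this fragment.  A minimal sketch, assuming each cs argument is a
# set of COG identifiers, could be a Jaccard-style distance:
def _jaccardCogDist(cs1, cs2):
    # Fraction of COGs not shared by the two genomes (0.0 means identical sets)
    union = cs1 | cs2
    if not union:
        return 0.0
    return 1.0 - float(len(cs1 & cs2)) / len(union)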
    FILE[i] = FILE[i].rstrip()

fragSeq = ''.join(FILE)
fracGC = fragSeq.count('C') + fragSeq.count('G')
genFragLen = len(fragSeq)

# Generate refMettable to work with . . . . . .
# Scan for CCGG sites and assign each one a random methylation state
index = 0
while (index > -1):
    index = fragSeq.find('CCGG', index + 3)
    if index > -1 and index < genFragLen - 10:
        METtable[index] = random.choice(methylstates)
    else:
        break

Mettable = open(refMetTable, 'w')
for pos, pcnt in METtable.iteritems():
    Mettable.write("%s\t%0.2f\n" % (pos + 1, pcnt))
Mettable.close()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# . . . Process differentially methylated copies . . . .
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if (ReadGenFile == 0):
    # Assign target quantitative state to each CCGG site in the sample . . . .
    fracGC = 100 * float(fracGC) / float(genFragLen)
    countCCGG = 0
    index = 7
    while (index > -1):
        index = fragSeq.find('CCGG', index + 3)
        if index > -1 and index < genFragLen - 10:
        count += 1
        if count % 1000 == 0:
            print count,
    print
    print k, "Done!!"

## Print Dict
print "Printing Dictionary"
for i, j in contextDict.iteritems():
    frequenciesFile.write(i[0] + "\t" + i[1] + "\t" + str(j) + "\n")
frequenciesFile.close()

## Print Test, Indices
print "Printing PureTrainFile"
for i in trainIndices:
    pureTrainFile.write(word_tag(enLines[i]))
    pureTrainFile.write(word_tag(frLines[i]))
pureTrainFile.close()

print "Printing Testfile"
def voteRefine(sequences, motifs):
    # Get background probabilities of each letter across all sequences
    lets = "ACGT"
    probability = DD(int)
    for seq in sequences:
        for let in sequences[seq]:
            probability[let] += 1
    s = sum(probability.values())
    for let in lets:
        probability[let] = probability[let] / float(s)

    # Conduct the poll
    poll = {}
    maxV = 0
    maxL = 0
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])
        if len(sequences[seq]) > maxL:
            maxL = len(sequences[seq])

    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in xrange(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we
                            # could bias based on tool or number of results or
                            # something like that
                            #poll[sequence][i - 1] += 1
                            if tool == "CMF":
                                poll[sequence][i - 1] += 1
                            if tool == "Weeder":
                                poll[sequence][i - 1] += 1
                            if tool == "MEME":
                                poll[sequence][i - 1] += 1
                            if tool == "DECOD":
                                poll[sequence][i - 1] += 1
                            if tool == "BioProspector":
                                poll[sequence][i - 1] += 1
                            if tool == "XXmotif":
                                poll[sequence][i - 1] += 1
                        except Exception as e:
                            print e
                            print 'It appears a tool has reported finding a motif',\
                                  'outside the bounds of a sequence'
                            print 'such as finding a motif of length 10 at position',\
                                  '195 in a sequence with length 200'
                            pdb.set_trace()
                        if poll[sequence][i - 1] > maxV:
                            maxV = poll[sequence][i - 1]

    # Inspect the poll: grow PWMs from well-supported windows
    ress = []
    THRESH = 3.7
    maxInsts = 0
    MLEN = MOTIF_LEN
    for seq in poll:
        for i in xrange(len(poll[seq]) - MLEN):
            if sum(poll[seq][i:i + MLEN]) >= MLEN * THRESH:
                curr = sequences[seq][i:i + MLEN]
                bestPWM = None
                bestMatching = 0
                for PWM in ress:
                    matching = compMotifPWM(curr, PWM)
                    if matching > bestMatching and matching > MLEN / 2:
                        bestMatching = matching
                        bestPWM = PWM
                if bestPWM == None:
                    bestPWM = [[0, 0, 0, 0] for x in xrange(MLEN)]
                    ress.append(bestPWM)
                for c, col in zip(curr, bestPWM):
                    col[ALPH[c]] += 1
                insts = sum(bestPWM[0])
                if insts > maxInsts:
                    maxInsts = insts

    # Vote on the consensus of each PWM across all sequences
    votedRess = DD(int)
    for PWM in ress:
        l = len(PWM)
        cons = PWMconsensus(PWM)
        for seq in sequences:
            for spos in xrange(0, len(sequences[seq]) - l):
                # 75% match threshold
                if compMotifPWM(sequences[seq][spos:spos + l], PWM) >= .75 * l:
                    for pos in xrange(spos, spos + l):
                        votedRess[cons] += poll[seq][pos]

    return sorted(votedRess.iteritems(), key=lambda a: a[::-1])
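# --- Usage sketch (added for illustration; not part of the original code) ---
# voteRefine relies on a few module-level names defined elsewhere in the
# project; the descriptions below are assumptions for illustration only:
#   DD        - behaves like collections.defaultdict
#   ALPH      - maps a base to its PWM column index, e.g. {'A': 0, 'C': 1, 'G': 2, 'T': 3}
#   MOTIF_LEN - the fixed motif width used when growing PWMs
# A toy call with the same input shapes as voteRank; whether any PWM survives
# depends on THRESH and MOTIF_LEN, so the output may be empty for small inputs.
def _exampleVoteRefine():
    sequences = {"seq1": "ACGTACGTACGTACGT", "seq2": "TTACGTACGTACGTTT"}
    motifs = {"MEME": {"ACGTACGT": {"seq1": [1, 5], "seq2": [3]}}}
    refined = voteRefine(sequences, motifs)
    for consensus, votes in refined:
        print consensus, votes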