Example #1
0
def distanceMatrixCorrelation(matrix1, matrix2, weights = None,
                              collectComponents = False):
    """
    Compare two equally sized matrices row by row with
    calculateWeightedKendall and summarise the per-row results.

    :param matrix1: first matrix; must provide getSize(), getArray(), row
        indexing and a ``names`` sequence (paired with the rows in order).
    :param matrix2: second matrix; must provide getSize() and row indexing.
    :param weights: optional matrix-like object with the same getSize();
        row i is forwarded to calculateWeightedKendall. Any falsy value
        means "no weights" (None is passed on instead).
    :param collectComponents: when true, calculateWeightedKendall receives a
        fresh ``components`` mapping for every row and the values are
        averaged per component key over all rows.
    :return: tuple ``(mean, std, sortedNames, compList)``: mean and standard
        deviation of the per-row Kendall values, the list of
        ``(name, value)`` pairs sorted by ascending value, and either None
        (collectComponents false) or the list of mean component values
        ordered by sorted component key.
    """

    size = matrix1.getSize()
    assert(size == matrix2.getSize())
    assert((not weights) or (size == weights.getSize()))

    # Component keys are the distinct values stored in matrix1; they are only
    # gathered when the caller asked for the per-component breakdown.
    compSet = set()
    if collectComponents:
        for row in matrix1.getArray():
            compSet.update(row)

    kendallList = []
    compDict = DefDict(list)
    for i in range(size):
        components = DefDict(float)
        kendallList.append(calculateWeightedKendall(matrix1[i],
            matrix2[i], weights = weights[i] if weights else None,
            components = components if collectComponents else None))
        # A key the callee did not fill in reads as 0.0 (float default), so
        # every component list ends up with exactly one entry per row.
        for k in compSet:
            compDict[k].append(components[k])

    sortedNames = sorted(zip(matrix1.names, kendallList),
                         key = operator.itemgetter(1))

    compList = None
    if collectComponents:
        # Built eagerly as a list: identical on Python 2, and avoids handing
        # back a lazy map object (or failing on iteritems) on Python 3.
        compList = [np.mean(compDict[k]) for k in sorted(compDict)]
    return (np.mean(kendallList), np.std(kendallList), sortedNames, compList)
Example #2
0
def voteRank(sequences, motifs):
    """Rank motifs by how many votes cover the positions they occupy.

    Every position covered by a reported motif occurrence receives one vote
    per report; a motif's score is the sum of the votes found on all
    positions of all of its occurrences.

    :param sequences: mapping of sequence name -> sequence; only the length
        of each sequence is used here.
    :param motifs: nested mapping
        ``tool -> motif -> sequence id -> iterable of start positions``.
        Positions are treated as 1-based (position p votes on index p - 1).
    :return: list of ``[votes, motif]`` pairs in ascending order.

    An occurrence that does not fit inside its sequence (start below 1 or
    end past the last base) is reported on stdout and only its in-range
    positions are counted, in both the polling and the tallying pass.
    """
    poll = {}
    for name in sequences:
        poll[name] = [0.0] * len(sequences[name])

    # Pass 1: cast the votes. The clipped index range of every occurrence is
    # remembered so that pass 2 revisits exactly the same positions.
    occurrences = []
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                # NOTE(review): best() is defined elsewhere; it is assumed to
                # map a tool-reported id to a key of `sequences` - confirm.
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    votes = poll[sequence]
                    first = pos - 1
                    stop = first + len(motif)
                    if first < 0 or stop > len(votes):
                        print('%s: motif %r at position %r does not fit in '
                              'sequence %r of length %d'
                              % (tool, motif, pos, sequence, len(votes)))
                        print('It appears a tool has reported finding a motif '
                              'outside the bounds of a sequence')
                        print('such as finding a motif of length 10 at position '
                              '195 in a sequence with length 200')
                    lo = max(first, 0)
                    hi = min(stop, len(votes))
                    # instead of weighting all results the same (1), we
                    # could bias based on tool or number of results
                    for i in range(lo, hi):
                        votes[i] += 1
                    occurrences.append((motif, votes, lo, hi))

    # Pass 2: add up the votes lying under each motif's occurrences.
    ress = DD(int)
    for motif, votes, lo, hi in occurrences:
        if lo < hi:
            ress[motif] += sum(votes[lo:hi])

    # Sort motifs by number of votes (ties broken by the motif itself).
    return sorted([count, motif] for motif, count in ress.items())
Example #3
0
        OUT1.write("%s\t%0.1f\t%0.1f\t%s\t%s\n" %
                   (transpos, RefMet[transpos], met, "nada", "nada"))
        # OUT1.write("%s\t%s\t%0.1f\t%s\t%s\t%s\n" % (transpos, RefMet[transpos], met, x[4], x[5], RefSeq[transpos]) )
        FoundMet[transpos] = met
        found += 1
    else:
        # Option for dumping all other 5mC calls . . . . . .
        met = 100 * (x[1] / x[2])
        # headers = "Pos\tExpMet\tObsMet\tMETscore\tUMTscore\tSeq\n"
        OUT2.write("%s\t%0.1f\t%0.1f\t%s\t%s\tnada\n" %
                   (transpos, RefMet[transpos], met, "nada", "nada"))

IN.close()
OUT1.close()
OUT2.close()

OUT3 = open(results + OUTlost, 'w')
OUT3.write("POS\tpMET\tLOST\n")
lost = 0
for (pos, count) in RefMet.iteritems():
    if count > 0:
        if FoundMet[pos] < 0:
            OUT3.write("%s\t%s\t0\n" % (pos, RefMet[pos]))
            lost += 1

OUT3.close()
print "    FOUND = ", found, " ;  LOST = ", lost
print "\n\n\n * * * * *   D O N E   * * * * * * \n\n\n"

# EOF ------------------------------------------------------------------------
# All-against-all distance table:
#   cogDist[a][b] = cogDistFunc(value of cogDict[a], value of cogDict[b])
cogDist = DefDict(dict)
ordinal = 0
for rowDir, rowCogs in cogDict.iteritems():
    ordinal += 1
    # "\r" plus the trailing comma keeps the progress counter on one line.
    print("\r%d. %s" % (ordinal, rowDir)),
    distRow = cogDist[rowDir]
    for colDir, colCogs in cogDict.iteritems():
        distRow[colDir] = cogDistFunc(rowCogs, colCogs)

print("\nBuilding average distances for TaxaTypes...")
# Genome dir -> {repr(taxa.type) -> list of COG distances from that dir to
# every genome of that type}. Only the raw lists are gathered here.
dirTaxaTypeDictDict = DefDict(lambda: DefDict(list))
for ordinal, dir1 in enumerate(taxaDict.keys(), start = 1):
    print("\r%d. %s" % (ordinal, dir1)),
    distsByType = dirTaxaTypeDictDict[dir1]
    distRow = cogDist[dir1]
    for dir2, taxa in taxaDict.iteritems():
        typeKey = repr(taxa.type)
        distsByType[typeKey].append(distRow[dir2])

print("\nRebuilding dirTaxaTypeDictDict to get UtilNormDistribs...")
# For every genome dir, pool the spread of its per-taxa-type distance lists.
# NOTE(review): the loop variable `dir` shadows the builtin of the same name.
for dir, d in dirTaxaTypeDictDict.iteritems():
    # Pooled spread over all taxa types: each list's sample variance
    # (np.std with ddof=1, squared) is weighted by the list's length.
    # Despite its name, `std` therefore ends up holding a weighted mean
    # *variance*, not a standard deviation.
    std = 0.
    totalLen = 0
    for taxaTypeStr, distList in d.iteritems():
        # Lists with fewer than two values are left out of the pooled value
        # (and of totalLen).
        if len(distList) >= 2:
            val = np.std(distList, ddof = 1.)
            std += val * val * len(distList)
            totalLen += len(distList)
    if totalLen == 0:
        # No taxa type contributed at least two distances for this dir.
        raise ValueError("Cannot calcuate global std for %s" % dir)
    std /= totalLen

    # Per-type spread: population variance (np.std with default ddof,
    # squared) of each list.
    # NOTE(review): in the visible code the value is computed and then
    # discarded; the rest of this loop body appears to be cut off here.
    for taxaTypeStr, distList in d.iteritems():
        localStd = np.std(distList)
        localStd *= localStd
Example #5
0
        FILE[i] = FILE[i].rstrip()
    fragSeq = ''.join(FILE)
    fracGC = fragSeq.count('C') + fragSeq.count('G')
    genFragLen = len(fragSeq)

    # Generate refMettable to work with. . . . . . . .
    index = 0
    while (index > -1):
        index = fragSeq.find('CCGG', index + 3)
        if index > -1 and index < genFragLen - 10:
            METtable[index] = random.choice(methylstates)
        else:
            break

    Mettable = open(refMetTable, 'w')
    for pos, pcnt in METtable.iteritems():
        Mettable.write("%s\t%0.2f\n" % (pos + 1, pcnt))
    Mettable.close()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# . . . Process differentially methylated copies . . . .
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

if (ReadGenFile == 0):
    # Assign target quantitative state to each CCGG site in the sample . . . .
    fracGC = 100 * float(fracGC) / float(genFragLen)
    countCCGG = 0
    index = 7
    while (index > -1):
        index = fragSeq.find('CCGG', index + 3)
        if index > -1 and index < genFragLen - 10:
Example #6
0
         count += 1
         
         if count % 1000 == 0:
             print count,
             
     print
     
     print k,"Done!!"
     
 ## Print Dict
 
 print "Printing Dictionary"
 
 
 
 for i,j in contextDict.iteritems():
     frequenciesFile.write(i[0]+"\t"+i[1]+"\t"+str(j)+"\n")
 
 frequenciesFile.close()
 
 ## Print Test, Indices
 
 print "printing PureTrainFile"
 
 for i in trainIndices:
     pureTrainFile.write(word_tag(enLines[i]))
     pureTrainFile.write(word_tag(frLines[i]))
 pureTrainFile.close()
 
 print "Printing Testfile"
 
Example #7
0
def voteRefine(sequences, motifs):
    """Derive consensus motifs from regions on which enough tools agree.

    The work is done in three phases:

    1. Poll - every position covered by a motif occurrence reported by a
       known tool receives that tool's weight as votes.
    2. Inspect - every window of MOTIF_LEN positions whose summed votes
       reach ``MOTIF_LEN * THRESH`` is added to the best matching count
       matrix (PWM) collected so far, or starts a new PWM.
    3. Vote - each PWM is slid over all sequences; every window matching it
       on at least 75% of its columns adds the poll votes under that window
       to the score of the PWM's consensus.

    :param sequences: mapping of sequence name -> sequence string.
    :param motifs: nested mapping
        ``tool -> motif -> sequence id -> iterable of start positions``.
        Positions are treated as 1-based (position p votes on index p - 1).
    :return: list of ``(consensus, votes)`` pairs sorted by votes, ties
        broken by consensus.

    An occurrence that does not fit inside its sequence is reported on
    stdout and only its in-range positions are counted.
    """
    # Votes contributed per covered position, by tool. Tools missing from
    # this table do not vote. Change a weight here to bias the poll.
    toolWeights = {
        "CMF": 1,
        "Weeder": 1,
        "MEME": 1,
        "DECOD": 1,
        "BioProspector": 1,
        "XXmotif": 1,
    }

    # conductPoll
    poll = {}
    for name in sequences:
        poll[name] = [0.0] * len(sequences[name])

    for tool in motifs:
        weight = toolWeights.get(tool)
        if weight is None:
            continue
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                # NOTE(review): best() is defined elsewhere; it is assumed to
                # map a tool-reported id to a key of `sequences` - confirm.
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    votes = poll[sequence]
                    first = pos - 1
                    stop = first + len(motif)
                    if first < 0 or stop > len(votes):
                        print('%s: motif %r at position %r does not fit in '
                              'sequence %r of length %d'
                              % (tool, motif, pos, sequence, len(votes)))
                        print('It appears a tool has reported finding a motif '
                              'outside the bounds of a sequence')
                        print('such as finding a motif of length 10 at position '
                              '195 in a sequence with length 200')
                    for i in range(max(first, 0), min(stop, len(votes))):
                        votes[i] += weight

    # inspectPoll
    ress = []
    THRESH = 3.7    # minimum average votes per position inside a window
    MLEN = MOTIF_LEN

    for seq in poll:
        # NOTE(review): this range stops one start short of the last full
        # window (len - MLEN); kept as is - confirm whether it is intended.
        for i in range(len(poll[seq]) - MLEN):
            if sum(poll[seq][i:i + MLEN]) < MLEN * THRESH:
                continue
            curr = sequences[seq][i:i + MLEN]

            # Pick the existing PWM that matches best, provided it agrees on
            # more than half of the columns.
            bestPWM = None
            bestMatching = 0
            for PWM in ress:
                matching = compMotifPWM(curr, PWM)
                if matching > bestMatching and matching > MLEN / 2:
                    bestMatching = matching
                    bestPWM = PWM
            if bestPWM is None:
                # One 4-slot counter column per motif position; the slot is
                # selected through ALPH below.
                bestPWM = [[0, 0, 0, 0] for _ in range(MLEN)]
                ress.append(bestPWM)
            for c, col in zip(curr, bestPWM):
                col[ALPH[c]] += 1

    votedRess = DD(int)
    for PWM in ress:
        l = len(PWM)
        cons = PWMconsensus(PWM)
        for seq in sequences:
            # NOTE(review): same "one short of the last window" range as above.
            for spos in range(0, len(sequences[seq]) - l):
                # 75% threshold: at least three quarters of the columns match
                if compMotifPWM(sequences[seq][spos:spos + l], PWM) >= .75 * l:
                    votedRess[cons] += sum(poll[seq][spos:spos + l])

    return sorted(votedRess.items(), key=lambda a: a[::-1])