Ejemplo n.º 1
0
def get_rate_correlation_windowed(dns, dss, aligned_prots, cdna_dicts,
                                  spec_orf_list, xfold_only, xfold_degeneracy,
                                  window_size=1):
    """Correlate windowed nonsynonymous and synonymous substitution counts.

    The sites ``0 .. len(dns) - 1`` are split into consecutive windows of
    ``window_size`` sites.  Within each window the entries of ``dns`` and
    ``dss`` are summed, ignoring sites where either entry is ``None``.
    Windows with at least one usable site contribute one value to each of
    the two result lists.

    Args:
        dns: Per-site nonsynonymous counts; entries may be ``None``.
        dss: Per-site synonymous counts, indexed like ``dns``; entries may
            be ``None``.
        aligned_prots: Aligned protein sequences, one per entry of
            ``spec_orf_list``; each must be as long as ``dns``.
        cdna_dicts: Nested mapping ``cdna_dicts[species][orf]`` giving the
            cDNA sequence.  If that lookup fails, ``species + '-mit'`` is
            tried instead.
        spec_orf_list: Sequence of ``(species, orf)`` pairs.
        xfold_only: If true (and ``window_size == 1``), keep only sites at
            which every aligned codon has degeneracy ``xfold_degeneracy``
            according to ``codon_degeneracy``, ends in one of ``CTAG`` and
            does not translate to an excluded amino acid (none are
            excluded at present).
        xfold_degeneracy: Required codon degeneracy for the filter above.
        window_size: Number of sites per window (default 1, the value that
            used to be hard-coded).

    Returns:
        A 3-tuple ``(correlation, ns, ss)``: ``correlation`` is whatever
        ``stats.PearsonCorrelation(ns, ss)`` returns, and ``ns`` / ``ss``
        are the per-window nonsynonymous / synonymous sums.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
        KeyError: If a cDNA is found under neither the species nor its
            ``-mit`` variant.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Third-position nucleotides accepted by the degeneracy filter, and
    # amino acids rejected by it (currently none).
    xfold_ending = 'CTAG'
    xfold_wrong_aas = ''

    n_sites = len(dns)

    aligned_cdnas = []
    for xi, (spec, orf) in enumerate(spec_orf_list):
        aligned_prot = aligned_prots[xi]
        # Only the dictionary lookup is guarded: a KeyError raised while
        # aligning must not be mistaken for a missing cDNA entry.
        try:
            cdna = cdna_dicts[spec][orf]
        except KeyError:
            cdna = cdna_dicts[spec + '-mit'][orf]
        aligned_cdnas.append(
            muscle.align_gene_from_protein(cdna, aligned_prot))
        assert len(aligned_prot) == n_sites

    def codon_passes(codon):
        # True if this codon satisfies every condition of the filter.
        if codon_degeneracy[codon] != xfold_degeneracy:
            return False
        if codon[2] not in xfold_ending:
            return False
        if (codon != '---'
                and translate._genetic_code[codon] in xfold_wrong_aas):
            return False
        return True

    # The codon filter is only meaningful for single-site windows.
    filter_codons = xfold_only and window_size == 1

    ns = []
    ss = []
    for site in range(0, n_sites, window_size):
        if filter_codons:
            codons = (cdna[3 * site:3 * site + 3] for cdna in aligned_cdnas)
            # Stops at the first codon that disqualifies the site.
            if not all(codon_passes(codon) for codon in codons):
                continue

        # Add up substitutions in window
        syn = 0
        nsyn = 0
        valid_sites = False
        for nsite in range(site, min(n_sites, site + window_size)):
            s = dss[nsite]
            n = dns[nsite]
            # Don't consider sites with missing counts
            if s is not None and n is not None:
                syn += s
                nsyn += n
                valid_sites = True
        if valid_sites:
            ns.append(nsyn)
            ss.append(syn)
    return stats.PearsonCorrelation(ns, ss), ns, ss
Ejemplo n.º 2
0
def get_rate_correlation_windowed(dns, dss, aligned_prots, cdna_dicts, spec_orf_list, xfold_only, xfold_degeneracy, window_size=1):
	"""Correlate windowed nonsynonymous and synonymous substitution counts.

	Sites 0 .. len(dns)-1 are grouped into consecutive windows of
	window_size sites.  For each window the entries of dns and dss are
	summed, skipping sites where either entry is None; a window with at
	least one usable site adds one value to each result list.

	Args:
		dns: Per-site nonsynonymous counts; entries may be None.
		dss: Per-site synonymous counts, indexed like dns; entries may be None.
		aligned_prots: Aligned protein sequences, one per (species, orf)
			pair; each must have the same length as dns.
		cdna_dicts: Nested mapping cdna_dicts[species][orf] -> cDNA.  When
			that lookup fails, species + '-mit' is tried instead.
		spec_orf_list: Sequence of (species, orf) pairs.
		xfold_only: If true (and window_size == 1), keep only sites where
			every aligned codon has degeneracy xfold_degeneracy according
			to codon_degeneracy, ends in one of 'CTAG', and does not
			translate to an excluded amino acid (none are excluded now).
		xfold_degeneracy: Codon degeneracy required by the filter above.
		window_size: Sites per window (default 1, previously hard-coded).

	Returns:
		(correlation, ns, ss), where correlation is the result of
		stats.PearsonCorrelation(ns, ss) and ns / ss hold the per-window
		nonsynonymous / synonymous sums.

	Raises:
		ValueError: If window_size is smaller than 1.
		KeyError: If a cDNA is found under neither the species nor its
			'-mit' variant.
	"""
	if window_size < 1:
		raise ValueError("window_size must be a positive integer")

	# Accepted third-position nucleotides and rejected amino acids
	# (currently none) for the degeneracy filter.
	xfold_ending = 'CTAG'
	xfold_wrong_aas = ''

	num_sites = len(dns)

	aligned_cdnas = []
	for xi, (spec, orf) in enumerate(spec_orf_list):
		prot = aligned_prots[xi]
		# Guard the lookup only, so that a KeyError raised by the aligner
		# is not misread as "cDNA missing, try the -mit genome".
		try:
			gene = cdna_dicts[spec][orf]
		except KeyError:
			gene = cdna_dicts[spec+'-mit'][orf]
		aligned_cdnas.append(muscle.align_gene_from_protein(gene, prot))
		assert len(prot) == num_sites

	def codon_ok(codon):
		# True if the codon meets every condition of the filter.
		if codon_degeneracy[codon] != xfold_degeneracy:
			return False
		if codon[2] not in xfold_ending:
			return False
		if codon != '---' and translate._genetic_code[codon] in xfold_wrong_aas:
			return False
		return True

	# The codon filter only applies to single-site windows.
	use_codon_filter = xfold_only and window_size == 1

	ns = []
	ss = []
	for site in range(0, num_sites, window_size):
		if use_codon_filter:
			site_codons = (cdna[3*site:3*site+3] for cdna in aligned_cdnas)
			# all() stops at the first disqualifying codon.
			if not all(codon_ok(codon) for codon in site_codons):
				continue

		# Add up substitutions in window
		syn = 0
		nsyn = 0
		valid_sites = False
		for nsite in range(site, min(num_sites, site+window_size)):
			s = dss[nsite]
			n = dns[nsite]
			# Don't consider sites with missing counts
			if s is not None and n is not None:
				syn += s
				nsyn += n
				valid_sites = True
		if valid_sites:
			ns.append(nsyn)
			ss.append(syn)
	return stats.PearsonCorrelation(ns, ss), ns, ss
Ejemplo n.º 3
0
def getShortestDistanceHits(hits, queryGeneDict, targetGeneDict, queryProtDict, targetProtDict, distanceCache, alignmentCache):
	"""Map each query gene to the candidate hit with the smallest ML dN.

	For every (queryGene, candidates) pair in hits, the distance to each
	candidate is taken from distanceCache or, on a cache miss, computed
	from a protein alignment and stored in both caches.  The candidate
	with the lowest dNML below 100.0 that is not the query gene itself
	is selected.  One progress line per query gene is printed.

	Args:
		hits: Mapping of query gene -> iterable of candidate target genes.
		queryGeneDict: Mapping of query gene -> gene sequence.
		targetGeneDict: Mapping of target gene -> gene sequence.
		queryProtDict: Mapping of query gene -> protein sequence.
		targetProtDict: Mapping of target gene -> protein sequence.
		distanceCache: Mapping keyed by cacheKey(queryGene, targetGene)
			holding 10-tuples (dDML, dSML, dNML, dDNG, dSNG, dNNG,
			numSynonymousSites, numNonsynonymousSites, fracAligned,
			seqIdentity).  Updated in place on a cache miss.
		alignmentCache: Mapping with the same keys holding
			((queryGene, alignedQueryGene, alignedQueryProt),
			(targetGene, alignedTargetGene, alignedTargetProt)).
			Updated in place on a cache miss.

	Returns:
		Dict of query gene -> selected target gene.  Query genes with no
		usable candidate are reported and left out, instead of failing
		on a cache lookup for a None target as before.
	"""
	queryToTargetSDHits = {}
	totalHits = len(hits)
	nHits = 0
	for (queryGene, qHits) in hits.items():
		minDist = 100.0
		minHitGene = None
		minHitFracAligned = None
		for targetGene in qHits:
			key = cacheKey(queryGene, targetGene)
			try:
				(dDML, dSML, dNML, dDNG, dSNG, dNNG, numSynonymousSites, numNonsynonymousSites, fracAligned, seqIdentity) = distanceCache[key]
			except KeyError:
				queryGeneSeq = queryGeneDict[queryGene]
				targetGeneSeq = targetGeneDict[targetGene]
				# Align the proteins, then thread the genes onto that alignment
				[alignedQueryProt, alignedTargetProt] = muscle.align_sequences([queryProtDict[queryGene], targetProtDict[targetGene]])
				alignedQueryGene = muscle.align_gene_from_protein(queryGeneSeq, alignedQueryProt)
				alignedTargetGene = muscle.align_gene_from_protein(targetGeneSeq, alignedTargetProt)
				(dDML, dSML, dNML, dDNG, dSNG, dNNG, numSynonymousSites, numNonsynonymousSites) = my_paml.Get_Distance_NS(alignedQueryGene, alignedTargetGene, 'codon')
				(seqIdentity, numIdentical, numAligned) = sequenceIdentity(alignedQueryProt, alignedTargetProt)
				fracAligned = numAligned/float(len(queryProtDict[queryGene]))
				if fracAligned == 1.0 and seqIdentity == 1.0:
					dNML = 0.0 # Obviously no nonsyn. changes if proteins are identical
				elif 1.0/numNonsynonymousSites > dNML:
					dNML = 1.0/numNonsynonymousSites # Minimum possible change
				(ntseqIdentity, ntnumIdentical, ntnumAligned) = sequenceIdentity(alignedQueryGene, alignedTargetGene)
				ntfracAligned = ntnumAligned/float(len(queryGeneSeq))
				if ntfracAligned == 1.0 and ntseqIdentity == 1.0:
					dSML = 0.0 # Obviously no syn. changes if genes are identical
				elif 1.0/numSynonymousSites > dSML:
					dSML = 1.0/numSynonymousSites # Minimum possible change
				# Cache this distance and alignment
				distanceCache[key] = (dDML, dSML, dNML, dDNG, dSNG, dNNG, numSynonymousSites, numNonsynonymousSites, fracAligned, seqIdentity)
				alignmentCache[key] = ((queryGene, alignedQueryGene, alignedQueryProt), (targetGene, alignedTargetGene, alignedTargetProt))
			# Check to see if this is a shorter distance
			if dNML < minDist and targetGene != queryGene:
				minDist = dNML
				minHitGene = targetGene
				# Remember the aligned fraction of the best hit for the report below
				minHitFracAligned = fracAligned
		nHits += 1
		if minHitGene is None:
			# No candidate at all, only the query itself, or every dN >= 100.0
			print("# %d of %d: %s had no usable hit" % (nHits, totalHits, queryGene))
			continue
		# Single-argument print(...) emits the same text under Python 2 and 3
		print("# %d of %d: %s was %s, dN = %1.6f, fracAlign = %1.6f" % (nHits, totalHits, queryGene, minHitGene, minDist, minHitFracAligned))
		queryToTargetSDHits[queryGene] = minHitGene
	return queryToTargetSDHits
Ejemplo n.º 4
0
            xprot for ((xgenome, xorf), xprot) in zip(corr_keys, aligned_prots)
            if xgenome in tree_species
        ]
        all_genes = []
        for (xgenome, xorf) in corr_keys:
            if xgenome in tree_species:
                try:
                    seq = cdna_dicts[xgenome][xorf]
                except KeyError:
                    seq = cdna_dicts[xgenome + '-mit'][xorf]
                all_genes.append(seq)
        all_species = tree_species  #[xgenome for (xgenome, xorf) in corr_keys]
        assert len(all_genes) == len(all_prots)
        assert len(all_species) == len(all_genes)
        all_aligned_genes = [
            muscle.align_gene_from_protein(xgene, xprot)
            for (xgene, xprot) in zip(all_genes, all_prots)
        ]

        # Put the genes in the right order
        sub_gene_dict = dict(zip(all_species, all_aligned_genes))
        recon_gene_list = [sub_gene_dict[spec] for spec in tree_species]

        try:
            #(rates,ancestor) = (1,2)
            (dns, dss) = my_paml.Get_dNdS_Per_Codon(recon_gene_list,
                                                    tree_species, tree, 100)
            #(rates, ancestor) = my_paml.Get_Site_Rates_And_Ancestor(recon_gene_list, tree_species, tree)
            (r, p, n) = stats.PearsonCorrelation(dns, dss)
            print "# %d %s %1.3f" % (n_genes, gene, r)
        except my_paml.PAMLError, pe:
Ejemplo n.º 5
0
			print "# rejected orf (mfal,msid) %s (%1.2f,%1.2f)" % (gene, mfal, msid)
			continue

		all_prots = [xprot for ((xgenome,xorf), xprot) in zip(corr_keys,aligned_prots) if xgenome in tree_species]
		all_genes = []
		for (xgenome, xorf) in corr_keys:
			if xgenome in tree_species:
				try:
					seq = cdna_dicts[xgenome][xorf]
				except KeyError:
					seq = cdna_dicts[xgenome+'-mit'][xorf]
				all_genes.append(seq)
		all_species = tree_species #[xgenome for (xgenome, xorf) in corr_keys]
		assert len(all_genes) == len(all_prots)
		assert len(all_species) == len(all_genes)
		all_aligned_genes = [muscle.align_gene_from_protein(xgene,xprot) for (xgene,xprot) in zip(all_genes,all_prots)]

		# Put the genes in the right order
		sub_gene_dict = dict(zip(all_species, all_aligned_genes))
		recon_gene_list = [sub_gene_dict[spec] for spec in tree_species]

		try:
			#(rates,ancestor) = (1,2)
			(dns, dss) = my_paml.Get_dNdS_Per_Codon(recon_gene_list, tree_species, tree, 100)
			#(rates, ancestor) = my_paml.Get_Site_Rates_And_Ancestor(recon_gene_list, tree_species, tree)
			(r, p, n) = stats.PearsonCorrelation(dns, dss)
			print "# %d %s %1.3f" % (n_genes, gene, r)
		except my_paml.PAMLError, pe:
			print "#",pe
			continue
		#print len(rates), len(ancestor)