def reduceLocusMap(geneTree, locusMapD):
    '''Build a locus map restricted to the genes in geneTree.

    Every leaf of geneTree (as returned by trees.leafList) is looked up
    in locusMapD and the leaf -> value pairs are returned in a new
    dict. locusMapD is not modified. A leaf that is missing from
    locusMapD raises KeyError.
    '''
    # the leaves of a gene tree are gene numbers
    return {geneNum: locusMapD[geneNum] for geneNum in trees.leafList(geneTree)}
def coreNonCoreCtAtNode(tree, node, familyByNodeL, familyL):
    '''Count the non-core and core families at a node of the tree.

    All families present in the species descending from node are
    collected. A family is non-core if its mrca lies below node (in the
    left or right child subtree) and core otherwise (mrca at node or
    above).

    Returns the tuple (nonCoreCt, coreCt).
    '''
    subtree = trees.subtree(tree, node)

    # Nodes below `node`. A set is used so that each membership test in
    # the counting step is constant time rather than a scan of a list.
    nonCoreMrcaS = set(trees.nodeList(subtree[1]))
    nonCoreMrcaS.update(trees.nodeList(subtree[2]))

    # every family with a member in some descendant species of this node
    decFamS = set()
    for leaf in trees.leafList(subtree):
        decFamS.update(familyByNodeL[leaf])

    # whatever is not non-core is core
    nonCoreCt = sum(1 for fam in decFamS if familyL[fam].mrca in nonCoreMrcaS)
    coreCt = len(decFamS) - nonCoreCt
    return nonCoreCt, coreCt
def coreNonCoreCtAtNode(tree, node, familyByNodeL, familyL):
    '''Given a tree and a node, count non-core and core families.

    First gather all the families present in species descending from
    node. Families whose mrca is located below node are non-core; those
    whose mrca is at node or above are core.

    Returns (nonCoreCt, coreCt).
    '''
    subtree = trees.subtree(tree, node)
    leftNodesL = trees.nodeList(subtree[1])
    rightNodesL = trees.nodeList(subtree[2])
    # set rather than list: we test membership once per family below
    belowNodeS = set(leftNodesL) | set(rightNodesL)

    # get set of all families with members in descendant species of
    # this node
    decFamS = set()
    for leaf in trees.leafList(subtree):
        decFamS.update(familyByNodeL[leaf])

    # figure out which are core, non-core
    nonCoreCt = 0
    coreCt = 0
    for fam in decFamS:
        if familyL[fam].mrca in belowNodeS:
            nonCoreCt += 1
        else:
            coreCt += 1
    return nonCoreCt, coreCt
def calcNormScores(tree, strainNum2StrD, blastFilePath, evalueThresh, scoresO, geneNames, aabrhFN):
    '''Add a normalized similarity score to every edge of scoresO.

    The all around best reciprocal hits are obtained from createAabrhL
    and summarized by getAabrhRawScoreSummmaryD, which is read here as a
    dict giving a (mean, std) pair for a (strain, strain) key. Each raw
    score in scoresO is passed to normScore together with the mean and
    std for its pair of strains, and the result is stored on the same
    edge under 'normSc'.

    The strain of a gene is taken to be the part of its name before the
    first '-'. A gene name with no '-' raises ValueError.

    Returns the tuple (scoresO, aabrhL, aabrhRawScoreSummmaryD).
    '''
    strainNamesL = sorted(strainNum2StrD[leaf] for leaf in trees.leafList(tree))
    aabrhL = createAabrhL(blastFilePath, strainNamesL, evalueThresh, aabrhFN)
    aabrhRawScoreSummmaryD = getAabrhRawScoreSummmaryD(
        strainNamesL, aabrhL, scoresO, geneNames)

    # loop over each edge in scoresO, normalizing score and saving there
    for gn1, gn2 in scoresO.iterateEdgesByEndNodes():
        rawSc = scoresO.getScoreByEndNodes(gn1, gn2, 'rawSc')

        # Split on the first '-' only, so that a hyphen later in the
        # gene name does not break the two-way unpacking.
        sp1, _ = geneNames.numToName(gn1).split('-', 1)
        sp2, _ = geneNames.numToName(gn2).split('-', 1)

        mean, std = aabrhRawScoreSummmaryD[(sp1, sp2)]
        normSc = normScore(rawSc, mean, std)
        scoresO.addScoreByEndNodes(gn1, gn2, normSc, 'normSc')

    return scoresO, aabrhL, aabrhRawScoreSummmaryD
def calcNormScores(tree, strainNum2StrD, blastFilePath, evalueThresh, scoresO, geneNames, aabrhFN):
    '''Given directory of blast output and a graph of raw similarity
    scores, calculate normalized similarity scores by comparing each
    score with the range of scores in all around best reciprocal hits
    in that pair of strains.

    The normalized score is saved on each edge of scoresO under
    'normSc'. Returns (scoresO, aabrhL, aabrhRawScoreSummmaryD).
    '''

    def strainOfGene(geneNum):
        '''Return the part of the gene's name before the first '-'.'''
        # maxsplit=1 keeps gene names containing further hyphens from
        # raising ValueError on unpacking
        strainName, _ = geneNames.numToName(geneNum).split('-', 1)
        return strainName

    strainNamesL = [strainNum2StrD[leaf] for leaf in trees.leafList(tree)]
    strainNamesL.sort()
    aabrhL = createAabrhL(blastFilePath, strainNamesL, evalueThresh, aabrhFN)
    aabrhRawScoreSummmaryD = getAabrhRawScoreSummmaryD(
        strainNamesL, aabrhL, scoresO, geneNames)

    for gn1, gn2 in scoresO.iterateEdgesByEndNodes():
        rawSc = scoresO.getScoreByEndNodes(gn1, gn2, 'rawSc')
        # find mean,std from aabrhRawScoreSummmaryD
        strainPairT = (strainOfGene(gn1), strainOfGene(gn2))
        mean, std = aabrhRawScoreSummmaryD[strainPairT]
        scoresO.addScoreByEndNodes(gn1, gn2, normScore(rawSc, mean, std), 'normSc')

    return scoresO, aabrhL, aabrhRawScoreSummmaryD
def createLRSets(tree, geneNames):
    '''Partition every gene into a left, right or outgroup set.

    A gene goes in the left set if its strain is a leaf of the left
    branch of tree (tree[1]), in the right set if its strain is a leaf
    of the right branch (tree[2]), and in the outgroup set otherwise.

    Returns the tuple (leftS, rightS, outgroupS).
    '''
    leftSpeciesS = set(trees.leafList(tree[1]))
    rightSpeciesS = set(trees.leafList(tree[2]))

    leftS, rightS, outgroupS = set(), set(), set()
    for geneNum in geneNames.nums:  # all genes
        strain = geneNames.numToStrainNum(geneNum)
        # pick the destination set first, then add once
        if strain in leftSpeciesS:
            destS = leftS
        elif strain in rightSpeciesS:
            destS = rightS
        else:
            destS = outgroupS
        destS.add(geneNum)

    return (leftS, rightS, outgroupS)
def createLRSets(tree, geneNames):
    '''Put every gene in our data into one of three sets: left, right,
    or outgroup.

    Genes in the left set are found in a species on the left branch of
    tree, genes in the right set in a species on the right branch, and
    all remaining genes are in the outgroup set.

    Returns (leftS, rightS, outgroupS).
    '''
    leftS = set()
    rightS = set()
    outgroupS = set()

    leftSpeciesS = set(trees.leafList(tree[1]))
    rightSpeciesS = set(trees.leafList(tree[2]))

    for geneNum in geneNames.nums:
        strain = geneNames.numToStrainNum(geneNum)
        if strain in leftSpeciesS:
            leftS.add(geneNum)
            continue
        if strain in rightSpeciesS:
            rightS.add(geneNum)
            continue
        # in neither branch
        outgroupS.add(geneNum)

    return (leftS, rightS, outgroupS)
def printIslandNeighb(islandNum, synWSize, subtreeL, islandByNodeL, familyL,
                      geneOrderT, gene2FamD, fam2IslandD, geneInfoD,
                      geneNames, strainNum2StrD, fileF):
    '''Print the neighborhood of an island to fileF.

    For each strain under the island's mrca we include the genes in the
    island and synWSize/2 genes in either direction. Strains in which
    the island has no genes are reported as such.

    Raises ValueError if no island with id islandNum is found in
    islandByNodeL (including when islandByNodeL is empty).
    '''
    print(" Island:", islandNum, file=fileF)
    genesInEitherDirec = int(synWSize / 2)

    # Get the island object for this islandNum. Start from None so that
    # an empty islandByNodeL is reported as "not found" below instead of
    # failing on an unbound name.
    island = None
    for listOfIslands in islandByNodeL:
        _, island = islands.searchIslandsByID(listOfIslands, islandNum)
        if island is not None:
            break
    if island is None:
        raise ValueError("Island " + str(islandNum) + " not found.")

    mrca = island.mrca
    print(" mrca:", strainNum2StrD[mrca], file=fileF)

    for strainNum in trees.leafList(subtreeL[mrca]):
        # end=' ' so the coordinates / not-found text follows on this line
        print(" In", strainNum2StrD[strainNum], end=' ', file=fileF)

        islandGenesInStrainL = getIslandGenesInStrain(island, strainNum, familyL)
        if not islandGenesInStrainL:
            print("the island is not found.", file=fileF)
            continue

        neighbGenesL, firstIslandGene, lastIslandGene = getNeighborhoodGenes(
            strainNum, geneOrderT, islandGenesInStrainL, genesInEitherDirec)

        # print coordinates of island in this strain; fields 3, 4 and 5
        # of a geneInfoD entry are used as chromosome, start and end
        chrom = geneInfoD[geneNames.numToName(islandGenesInStrainL[0])][3]
        startPos = geneInfoD[geneNames.numToName(firstIslandGene)][4]
        endPos = geneInfoD[geneNames.numToName(lastIslandGene)][5]
        print("(Coordinates",
              chrom + ":" + str(startPos) + "-" + str(endPos) + ")",
              file=fileF)

        printGenes(neighbGenesL, geneNames, gene2FamD, fam2IslandD,
                   geneInfoD, islandGenesInStrainL, familyL,
                   strainNum2StrD, fileF)
def familyPrintStrainsPresentAbsent(tree, strainNum2StrD, familyL, famNum, fileF=sys.stdout):
    '''Print the strains that have family famNum, then those that lack it.

    Each leaf of tree is tested with familyL[famNum].isInStrain and its
    name (from strainNum2StrD) is listed under the matching heading.
    Output goes to fileF.
    '''
    presL, notPresL = [], []
    for leafNum in trees.leafList(tree):
        # choose which list this strain belongs in, then append once
        targetL = presL if familyL[famNum].isInStrain(leafNum) else notPresL
        targetL.append(strainNum2StrD[leafNum])

    print("Family:", famNum, file=fileF)

    print(" Strains possessing:", file=fileF)
    for strain in presL:
        print(" " + strain, file=fileF)

    # blank line between the two lists
    print(file=fileF)

    print(" Strains lacking:", file=fileF)
    for strain in notPresL:
        print(" " + strain, file=fileF)
def vPrintIsland(island, subtreeL, familyL, strainNum2StrD, geneNames, fileF):
    '''Verbose print of an island, as a table written to fileF.

    The table has one column per species under the island's mrca and
    one row per family in the island. Each cell holds the
    comma-separated names of that family's genes in that species.
    '''
    print(" Island", island.id, file=fileF)

    # species nodes subtended by this mrca
    speciesNodesL = trees.leafList(subtreeL[island.mrca])

    # header row: 'Family' followed by the strain names
    headerL = ['Family'] + [strainNum2StrD[node] for node in speciesNodesL]
    printL = [headerL]

    for fam in island.familyL:
        rowL = [str(fam)]
        for node in speciesNodesL:
            geneT = familyL[fam].famGeneT[node]
            rowL.append(",".join(geneNames.numToName(geneNum) for geneNum in geneT))
        printL.append(rowL)

    printTable(printL, indent=4, fileF=fileF)
def printIslandNeighb(islandNum, synWSize, subtreeL, islandByNodeL, familyL,
                      geneOrderT, gene2FamD, fam2IslandD, geneInfoD,
                      geneNames, strainNum2StrD, fileF):
    '''Print the neighborhood of an island. We include the genes in the
    island and synWSize/2 genes in either direction.

    Output is written to fileF, one section per strain that descends
    from the island's mrca. Raises ValueError if islandNum is not found
    in any of the lists in islandByNodeL.
    '''
    print(" Island:", islandNum, file=fileF)
    genesInEitherDirec = int(synWSize / 2)

    # Get the island object for this islandNum. The else branch of the
    # for loop runs when we never hit `break`, which also covers an
    # empty islandByNodeL (where `island` would otherwise be unbound).
    for listOfIslands in islandByNodeL:
        _, island = islands.searchIslandsByID(listOfIslands, islandNum)
        if island is not None:
            break
    else:
        raise ValueError("Island " + str(islandNum) + " not found.")

    mrca = island.mrca
    print(" mrca:", strainNum2StrD[mrca], file=fileF)

    leavesL = trees.leafList(subtreeL[mrca])
    for strainNum in leavesL:
        print(" In", strainNum2StrD[strainNum], end=' ', file=fileF)
        islandGenesInStrainL = getIslandGenesInStrain(island, strainNum, familyL)

        if islandGenesInStrainL:
            neighbGenesL, firstIslandGene, lastIslandGene = getNeighborhoodGenes(
                strainNum, geneOrderT, islandGenesInStrainL, genesInEitherDirec)

            # print coordinates of island in this strain
            chromGeneName = geneNames.numToName(islandGenesInStrainL[0])
            firstGeneName = geneNames.numToName(firstIslandGene)
            lastGeneName = geneNames.numToName(lastIslandGene)
            chrom = geneInfoD[chromGeneName][3]
            startPos = geneInfoD[firstGeneName][4]
            endPos = geneInfoD[lastGeneName][5]
            coordStr = chrom + ":" + str(startPos) + "-" + str(endPos) + ")"
            print("(Coordinates", coordStr, file=fileF)

            printGenes(neighbGenesL, geneNames, gene2FamD, fam2IslandD,
                       geneInfoD, islandGenesInStrainL, familyL,
                       strainNum2StrD, fileF)
        else:
            print("the island is not found.", file=fileF)
def vPrintIsland(island, subtreeL, familyL, strainNum2StrD, geneNames, fileF):
    '''Verbose print of an island.

    Writes the island id to fileF, followed by a table (via printTable)
    whose first row names the strains under the island's mrca and whose
    remaining rows give, per family, the genes present in each strain.
    '''

    def genesCell(fam, node):
        '''Comma-joined gene names of family fam in species node.'''
        geneT = familyL[fam].famGeneT[node]
        return ",".join([geneNames.numToName(geneNum) for geneNum in geneT])

    print(" Island", island.id, file=fileF)

    # get species nodes subtended by this mrca
    speciesNodesL = trees.leafList(subtreeL[island.mrca])

    # put everything in lists
    printL = [['Family']]
    printL[0].extend(strainNum2StrD[node] for node in speciesNodesL)
    for fam in island.familyL:
        printL.append([str(fam)] + [genesCell(fam, node) for node in speciesNodesL])

    printTable(printL, indent=4, fileF=fileF)
def familyPrintStrainsPresentAbsent(tree, strainNum2StrD, familyL, famNum, fileF=sys.stdout):
    '''Print a list of strains where the family is present, and another
    where it is absent.

    Strains are the leaves of tree, named through strainNum2StrD.
    Presence is decided by familyL[famNum].isInStrain. Everything is
    written to fileF.
    '''

    def printStrains(heading, strainL):
        '''Write a heading line followed by one line per strain.'''
        print(heading, file=fileF)
        for strain in strainL:
            print(" " + strain, file=fileF)

    presL = []
    notPresL = []
    for leafNum in trees.leafList(tree):
        strainName = strainNum2StrD
        if familyL[famNum].isInStrain(leafNum):
            presL.append(strainName[leafNum])
        else:
            notPresL.append(strainName[leafNum])

    print("Family:", famNum, file=fileF)
    printStrains(" Strains possessing:", presL)
    print(file=fileF)
    printStrains(" Strains lacking:", notPresL)
def printIslandNeighb(islandNum, synWSize, subtreeL, islandByNodeL, familyL,
                      geneOrderT, gene2FamD, fam2IslandD, geneInfoD,
                      geneNames, strainNum2StrD, fileF):
    '''Print the neighborhood of an island. We include the genes in the
    island and synWSize/2 genes in either direction.

    For every strain under the island's mrca, the island's coordinates
    are printed followed by a table with one row per neighborhood gene
    (name, island, family, error score, island mrca, description).
    Genes belonging to the island are marked with a '*'. Output goes to
    fileF.

    Raises ValueError if islandNum is not found in islandByNodeL.
    '''
    print(" Island:", islandNum, file=fileF)
    genesInEitherDirec = int(synWSize / 2)

    # Get the island object for this islandNum. Without the explicit
    # check below, a missing island would only surface as an
    # AttributeError on island.mrca (or a NameError if islandByNodeL is
    # empty).
    island = None
    for listOfIslands in islandByNodeL:
        _, island = islands.searchIslandsByID(listOfIslands, islandNum)
        if island is not None:
            break
    if island is None:
        raise ValueError("Island " + str(islandNum) + " not found.")

    mrca = island.mrca
    print(" mrca:", strainNum2StrD[mrca], file=fileF)

    leavesL = trees.leafList(subtreeL[mrca])
    for strainNum in leavesL:
        print(" In", strainNum2StrD[strainNum], end=' ', file=fileF)

        islandGenesInStrainL = getIslandGenesInStrain(island, strainNum, familyL)
        if not islandGenesInStrainL:
            print("the island is not found.", file=fileF)
            continue

        neighbGenesL, firstIslandGene, lastIslandGene = getNeighborhoodGenes(
            strainNum, geneOrderT, islandGenesInStrainL, genesInEitherDirec)

        # print coordinates of island in this strain
        chrom = geneInfoD[geneNames.numToName(islandGenesInStrainL[0])][3]
        startPos = geneInfoD[geneNames.numToName(firstIslandGene)][4]
        endPos = geneInfoD[geneNames.numToName(lastIslandGene)][5]
        print("(Coordinates",
              chrom + ":" + str(startPos) + "-" + str(endPos) + ")",
              file=fileF)

        # now print the neighbors
        rowsL = []
        for tempGene in neighbGenesL:
            tempGeneName = geneNames.numToName(tempGene)
            tempFamNum = gene2FamD[tempGene]
            tempGeneIsland = fam2IslandD[tempFamNum]

            # genes without an entry in geneInfoD get an empty description
            if tempGeneName in geneInfoD:
                descrip = geneInfoD[tempGeneName][2]
            else:
                descrip = ''

            # mark genes in the island with a *
            if tempGene in islandGenesInStrainL:
                tempGeneName = '* ' + tempGeneName
            else:
                tempGeneName = ' ' + tempGeneName

            infoL = [
                tempGeneName,
                "isl:" + str(tempGeneIsland.id),
                "fam:" + str(tempFamNum),
                "errSc:" + str(familyL[tempFamNum].possibleErrorCt),
                "mrca:" + strainNum2StrD[tempGeneIsland.mrca],
                descrip,
            ]
            rowsL.append(infoL)

        printTable(rowsL, indent=4, fileF=fileF)
## main
#TODO: proper CLI
if __name__ == "__main__":

    # command line: species tree file, then gene tree file
    speciesTreeFN = sys.argv[1]
    geneTreeFN = sys.argv[2]

    # load stuff
    speciesTree = trees.readTree(speciesTreeFN)
    geneTree = trees.loadOneGeneTree(geneTreeFN)

    # tip map, cut down to the leaves of this gene tree
    bigTipMapD = loadD("tipMap.tsv")
    tipMapD = {}
    for leaf in trees.leafList(geneTree):
        tipMapD[leaf] = bigTipMapD[leaf]

    # locus map, likewise cut down to this gene tree
    locusMapD = loadD("locusMap.tsv")
    gtLocusMapD = familiesDTLORstuff.reduceLocusMap(geneTree, locusMapD)

    # createLocusMapForRootingD is handed a deep copy of gtLocusMapD, so
    # gtLocusMapD itself is passed on to reconcile as built above
    locusMapForRootingD = trees.createLocusMapForRootingD(
        geneTree, copy.deepcopy(gtLocusMapD))

    argT = (
        speciesTree,
        geneTree,
        tipMapD,
        gtLocusMapD,
        locusMapForRootingD,
        D, T, L, O, R,
    )
    optRootedGeneTree, optMPR = familiesDTLORstuff.reconcile(argT)

    print("Rooted tree:")
    print(optRootedGeneTree)
    print()
such a list. Its not intended to be used every time we run, but rather to make the list, which can then be put inside a parameter file. Doubtless there's a better way to make this list. ''' L=[] for i in range(0,int(stride/offset)): st=mn+offset*i end=mx for j in range(st,end,stride): L.append(j) return L if __name__ == "__main__": paramFN=sys.argv[1] paramD = parameters.loadParametersD(paramFN) ## load data structures we'll use below tree,strainStr2NumD,strainNum2StrD = trees.readTree(paramD['treeFN']) leafNodesL = trees.leafList(tree) geneNames = genomes.geneNames(paramD['geneOrderFN'],strainStr2NumD,strainNum2StrD) familyL = families.readFamilies(paramD['familyFN'],tree,geneNames,strainStr2NumD) islandByNodeL=islands.readIslands(paramD['islandOutFN'],tree,strainStr2NumD) geneInfoD = genomes.readGeneInfoD(paramD['geneInfoFN']) # get islands organized by strain islandByStrainD = createIslandByStrainD(leafNodesL,strainNum2StrD,islandByNodeL,familyL,geneNames,geneInfoD) createAllGffs(islandByStrainD,geneInfoD,tree,strainNum2StrD,paramD['gffFilePath'],paramD['scoreNodeMapD'],paramD['potentialScoresL'])
paramFN = sys.argv[1] paramD = parameters.loadParametersD(paramFN) tree, strainStr2NumD, strainNum2StrD = trees.readTree(paramD['treeFN']) # get familyL etc. geneNames = genomes.geneNames(paramD['geneOrderFN'], strainStr2NumD, strainNum2StrD) # scores scoresO = scores.readScores(paramD['scoresFN'], geneNames) aabrhL = scores.loadOrthos(paramD['aabrhFN']) strainNamesL = sorted( [strainNum2StrD[leaf] for leaf in trees.leafList(tree)]) aabrhRawScoreSummmaryD = scores.getAabrhRawScoreSummmaryD( strainNamesL, aabrhL, scoresO, geneNames) print( "Mean and standard deviation of raw scores between aabrh orthologs for pairs of species." ) rowL = [] rowL.append(['Species 1', 'Species 2', 'Mean', 'Standard dev']) rowL.append(['---------', '---------', '----', '------------']) for keyT, valT in aabrhRawScoreSummmaryD.items(): row = [] row.extend(keyT) row.append(format(valT[0], '.3f')) row.append(format(valT[1], '.3f'))