def reconcile(fileName, D, T, L):
    """Build the DTL reconciliation graph for a newick file.

    fileName is the newick file to read; D, T and L are the duplication,
    transfer and loss costs.  The host tree, parasite tree and tip mapping
    are extracted with newickFormatReader.getInput and passed, together
    with the three costs, to DP.  Whatever DP returns is returned as is.
    """
    parsed_input = newickFormatReader.getInput(fileName)
    host_tree, parasite_tree, tip_mapping = parsed_input
    reconciliation = DP(host_tree, parasite_tree, tip_mapping, D, T, L)
    return reconciliation
def main():
    """Run the analysis selected on the command line.

    The arguments come from process_arg() and the newick data from
    getInput(args.filename).  args.functionality picks one of:

      "costscape"  -> costscape.solve
      "reconcile"  -> DTLReconGraph.reconcile_noninter
      "histogram"  -> HistogramMain.compute_pdv
      "clumpr"     -> ClusterMain.perform_clustering

    Any other value falls through and nothing is run.
    """
    args = process_arg()
    newick_data = getInput(args.filename)
    mode = args.functionality

    if mode == "costscape":
        costscape.solve(newick_data, args.tl, args.th, args.ll, args.lh, args)
        return
    if mode == "reconcile":
        DTLReconGraph.reconcile_noninter(newick_data, args.d, args.t, args.l)
        return
    if mode == "histogram":
        HistogramMain.compute_pdv(args.filename, newick_data,
                                  args.d, args.t, args.l, args)
        return
    if mode == "clumpr":
        ClusterMain.perform_clustering(newick_data, args.d, args.t, args.l,
                                       args.k, args)
def newScoreWrapper(newickFile, switchLo, switchHi, lossLo, lossHi, D, T, L):
    """Return the DTL graph of a newick file rescored from costscape.

    newickFile is read with newickFormatReader.getInput; D, T and L are the
    duplication, transfer and loss costs used for the initial DP run.
    switchLo/switchHi and lossLo/lossHi are forwarded to findCenters; the
    points it yields are turned into DTL values (getDTLVals) and then into
    costscape DTLs (getCostscapeDTLs), which changeDTLScores applies to the
    graph produced by DP.

    Returns the tuple (newDTL, numRecon, leaves), where numRecon and leaves
    are the second and third values returned by DP.
    """
    H, P, phi = newickFormatReader.getInput(newickFile)
    originalDTL, numRecon, leaves = DP(H, P, phi, D, T, L)

    centers = findCenters(newickFile, switchLo, switchHi, lossLo, lossHi)
    costscapeDTLs = getCostscapeDTLs(getDTLVals(centers), H, P, phi)

    return changeDTLScores(originalDTL, costscapeDTLs), numRecon, leaves
def run_test(fileName, max_k):
    """Run K-means over the reconciliations of one newick file, k = 1..max_k.

    The reconciliation graph is built by DP.DP with fixed costs (D=2, T=3,
    L=1) and cached as repr() text in ./cache/<basename>.graph; when that
    file already exists it is reused instead of recomputing.  The initial
    representatives are those returned by Greedy.Greedy, followed by max_k
    templates from KMeans.get_template drawn after random.seed(0).

    fileName -- path of the newick file to process
    max_k    -- largest number of clusters to try

    Progress messages are written to stderr and a 'k = <i>' line to stdout
    before each KMeans.k_means call.  Nothing is returned.
    """
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        with open('%s/README' % cache_dir, 'w') as readme:
            readme.write('This directory holds a cache of reconciliation graph for the TreeLife data set')

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location):
        sys.stderr.write('A reconciliation graph has not been built yet for this newick file' + '\n')
        sys.stderr.write('Doing so now and caching it in {%s}...' % cache_location + '\n')

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        # 'with' guarantees the cache file is closed even if the write fails.
        with open(cache_location, 'w+') as cache_file:
            cache_file.write(repr(DictGraph))

    sys.stderr.write('Loading reonciliation graph from cache' + '\n')
    with open(cache_location) as cache_file:
        # NOTE(review): eval() executes whatever the cache file contains.
        # That is only safe while ./cache is written exclusively by this
        # script; consider ast.literal_eval if the graph is a pure literal.
        DictGraph = eval(cache_file.read())

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    sys.stderr.write('Found cluster representatives using point-collecting' + '\n')

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep)
               for dictRep in dictReps]

    # Fixed seed so the extra templates are the same on every run.
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for _ in range(max_k)]

    representatives = setReps + extra_reps

    sys.stderr.write('Starting K Means algorithm ... ' + '\n')
    sys.stderr.write('Printing Average and Maximum cluster radius at each step' + '\n')

    for i in range(1, max_k + 1):
        print('k = %d' % i)
        KMeans.k_means(graph, 10, i, 0, representatives[:i])
def freqSummation(argList):
    """Score the reconciliations of a newick file and write a summary file.

    argList layout (index 0 is ignored):
      [1] newick file path (the last 7 characters, e.g. '.newick', are
          stripped to form the output prefix)
      [2] duplication cost, [3] transfer cost, [4] loss cost
      [5] scoring type: "Frequency", "xscape" or "unit"
      [6] switch low, [7] switch high, [8] loss low, [9] loss high
          (forwarded to the xscape scoring)

    Writes <prefix>freqFile.txt with four lines: the list of scores
    returned by Greedy.Greedy, their sum, the event cost of the last
    reconciliation in the returned list (0 when the list is empty), and
    the number of reconciliations reported by DP.DP.

    Raises ValueError when the scoring type is not one of the three above.
    """
    newickFile = argList[1]
    D = float(argList[2])
    T = float(argList[3])
    L = float(argList[4])
    freqType = argList[5]
    switchLo = float(argList[6])
    switchHi = float(argList[7])
    lossLo = float(argList[8])
    lossHi = float(argList[9])
    fileName = newickFile[:-7]

    host, paras, phi = newickFormatReader.getInput(newickFile)
    DTL, numRecon = DP.DP(host, paras, phi, D, T, L)

    if freqType == "Frequency":
        newDTL = DTL
    elif freqType == "xscape":
        # NOTE(review): a newScoreWrapper variant in this code base returns
        # (newDTL, numRecon, leaves); if that is the one imported here the
        # result needs unpacking before being passed to Greedy -- confirm.
        newDTL = calcCostscapeScore.newScoreWrapper(
            newickFile, switchLo, switchHi, lossLo, lossHi, D, T, L)
    elif freqType == "unit":
        newDTL = MasterReconciliation.unitScoreDTL(host, paras, phi, D, T, L)
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("unknown frequency type: %r" % (freqType,))

    scoresList, reconciliation = Greedy.Greedy(newDTL, paras)
    totalSum = sum(scoresList)

    # The cost is reset for every reconciliation, so the value written is
    # the cost of the last one; it stays 0 when there are none.
    eventCosts = {"L": L, "T": T, "D": D}
    totalCost = 0
    for recon in reconciliation:
        totalCost = 0
        for key in recon:
            totalCost += eventCosts.get(recon[key][0], 0)

    # Open the output only once everything is computed, so a failure above
    # does not leave a truncated file behind, and close it on any error.
    with open(fileName + "freqFile.txt", 'w') as f:
        f.write(str(scoresList) + '\n')
        f.write(str(totalSum) + '\n')
        f.write(str(totalCost) + '\n')
        f.write(str(numRecon))
def freqSummation(argList):
    """Score the reconciliations of a newick file and write a summary file.

    argList layout (index 0 is ignored):
      [1] newick file path (the last 7 characters, e.g. '.newick', are
          stripped to form the output prefix)
      [2] duplication cost, [3] transfer cost, [4] loss cost
      [5] scoring type: "Frequency", "xscape" or "unit"
      [6] switch low, [7] switch high, [8] loss low, [9] loss high
          (forwarded to the xscape scoring)

    Prints the number of reconciliations reported by DP.DP, then writes
    <prefix>freqFile.txt with four lines: the list of scores returned by
    Greedy.Greedy, their sum, the event cost of the last reconciliation in
    the returned list (0 when the list is empty), and the number of
    reconciliations.

    Raises ValueError when the scoring type is not one of the three above.
    """
    newickFile = argList[1]
    D = float(argList[2])
    T = float(argList[3])
    L = float(argList[4])
    freqType = argList[5]
    switchLo = float(argList[6])
    switchHi = float(argList[7])
    lossLo = float(argList[8])
    lossHi = float(argList[9])
    fileName = newickFile[:-7]

    host, paras, phi = newickFormatReader.getInput(newickFile)
    DTL, numRecon = DP.DP(host, paras, phi, D, T, L)
    print(numRecon)

    if freqType == "Frequency":
        newDTL = DTL
    elif freqType == "xscape":
        # NOTE(review): a newScoreWrapper variant in this code base returns
        # (newDTL, numRecon, leaves); if that is the one imported here the
        # result needs unpacking before being passed to Greedy -- confirm.
        newDTL = calcCostscapeScore.newScoreWrapper(
            newickFile, switchLo, switchHi, lossLo, lossHi, D, T, L)
    elif freqType == "unit":
        newDTL = MasterReconciliation.unitScoreDTL(host, paras, phi, D, T, L)
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("unknown frequency type: %r" % (freqType,))

    scoresList, reconciliation = Greedy.Greedy(newDTL, paras)
    totalSum = sum(scoresList)

    # The cost is reset for every reconciliation, so the value written is
    # the cost of the last one; it stays 0 when there are none.
    eventCosts = {"L": L, "T": T, "D": D}
    totalCost = 0
    for recon in reconciliation:
        totalCost = 0
        for key in recon:
            totalCost += eventCosts.get(recon[key][0], 0)

    # Open the output only once everything is computed, so a failure above
    # does not leave a truncated file behind, and close it on any error.
    with open(fileName + "freqFile.txt", 'w') as f:
        f.write(str(scoresList) + '\n')
        f.write(str(totalSum) + '\n')
        f.write(str(totalCost) + '\n')
        f.write(str(numRecon))
def run_test(fileName, max_k):
    """Run K-means over the reconciliations of one newick file, k = 1..max_k.

    The reconciliation graph is built by DP.DP with fixed costs (D=2, T=3,
    L=1) and cached as repr() text in ./cache/<basename>.graph; when that
    file already exists it is reused instead of recomputing.  The initial
    representatives are those returned by Greedy.Greedy, followed by max_k
    templates from KMeans.get_template drawn after random.seed(0).

    fileName -- path of the newick file to process
    max_k    -- largest number of clusters to try

    Progress messages are written to stderr and a 'k = <i>' line to stdout
    before each KMeans.k_means call.  Nothing is returned.
    """
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        with open('%s/README' % cache_dir, 'w') as readme:
            readme.write('This directory holds a cache of reconciliation graph for the TreeLife data set')

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location):
        sys.stderr.write('A reconciliation graph has not been built yet for this newick file' + '\n')
        sys.stderr.write('Doing so now and caching it in {%s}...' % cache_location + '\n')

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        # 'with' guarantees the cache file is closed even if the write fails.
        with open(cache_location, 'w+') as cache_file:
            cache_file.write(repr(DictGraph))

    sys.stderr.write('Loading reonciliation graph from cache' + '\n')
    with open(cache_location) as cache_file:
        # NOTE(review): eval() executes whatever the cache file contains.
        # That is only safe while ./cache is written exclusively by this
        # script; consider ast.literal_eval if the graph is a pure literal.
        DictGraph = eval(cache_file.read())

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    sys.stderr.write('Found cluster representatives using point-collecting' + '\n')

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep)
               for dictRep in dictReps]

    # Fixed seed so the extra templates are the same on every run.
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for _ in range(max_k)]

    representatives = setReps + extra_reps

    sys.stderr.write('Starting K Means algorithm ... ' + '\n')
    sys.stderr.write('Printing Average and Maximum cluster radius at each step' + '\n')

    for i in range(1, max_k + 1):
        print('k = %d' % i)
        KMeans.k_means(graph, 10, i, 0, representatives[:i])
def Reconcile(argList):
    """Reconcile a newick file and write visualisation files per solution.

    argList layout (index 0 is ignored):
      [1] .newick file path
      [2] duplication cost, [3] transfer cost, [4] loss cost
      [5] scoring type: "xscape", "unit", or anything else for the default
          scores produced by DP.DP
      [6] switch low, [7] switch high, [8] loss low, [9] loss high --
          read only when the scoring type is "xscape"

    For every reconciliation returned by Greedy.Greedy the host tree is
    written through newickToVis.convert (the parasite tree is requested
    only for the first one) and the reconciliation through
    reconConversion.convert.  The DP graph and reconciliation count are
    printed to stdout.  Nothing is returned.
    """
    fileName = argList[1]        # .newick file
    D = float(argList[2])        # Duplication cost
    T = float(argList[3])        # Transfer cost
    L = float(argList[4])        # Loss cost
    freqType = argList[5]        # Frequency type

    # The ranges are documented as optional unless xscape scoring is used,
    # so only demand them in that case instead of failing with IndexError.
    if freqType == "xscape":
        switchLo = float(argList[6])   # Switch lower boundary
        switchHi = float(argList[7])   # Switch upper boundary
        lossLo = float(argList[8])     # Loss lower boundary
        lossHi = float(argList[9])     # Loss upper boundary

    host, paras, phi = newickFormatReader.getInput(fileName)
    # NOTE(review): hostRoot and Order are never used below; the calls are
    # kept in case they have side effects -- confirm and remove.
    hostRoot = cycleCheckingGraph.findRoot(host)
    hostv = cycleCheckingGraph.treeFormat(host)
    Order = orderGraph.date(hostv)

    # Default scoring function (used when no other scoring type matches).
    DTLReconGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
    print("%s %s" % (DTLReconGraph, numRecon))

    if freqType == "xscape":
        # xScape scoring function
        DTLReconGraph = calcCostscapeScore.newScoreWrapper(
            fileName, switchLo, switchHi, lossLo, lossHi, D, T, L)
    elif freqType == "unit":
        # Unit scoring function
        DTLReconGraph = unitScoreDTL(host, paras, phi, D, T, L)

    # Greedy gets its own copy; the graph passed to reconConversion below
    # is the one from before the Greedy call.
    DTLGraph = copy.deepcopy(DTLReconGraph)
    scoresList, rec = Greedy.Greedy(DTLGraph, paras)

    for n in range(len(rec)):
        graph = cycleCheckingGraph.buildReconciliation(host, paras, rec[n])
        currentOrder = orderGraph.date(graph)
        if currentOrder == "timeTravel":
            # Replace the reconciliation with the one returned by the cycle
            # detector and date the graph it hands back.
            rec[n], currentOrder = detectCycles.detectCyclesWrapper(
                host, paras, rec[n])
            currentOrder = orderGraph.date(currentOrder)
        hostOrder = hOrder(hostv, currentOrder)
        hostBranchs = branch(hostv, hostOrder)
        # Only the first reconciliation asks for the parasite tree file.
        newickToVis.convert(fileName, hostBranchs, n, 1 if n == 0 else 0)
        # fileName[:-7] is the file name minus the .newick
        reconConversion.convert(rec[n], DTLReconGraph, paras,
                                fileName[:-7], n)
def main():
    """Report and repair infeasible reconciliations for the COG data set.

    For i = 1..fileNum, the file real-100taxa/COG<iiii>.newick (index zero
    padded to four digits) is processed if it exists.  It is reconciled
    through MasterReconciliation.Reconcile with unit scoring and the
    module-level costs dVal, tVal and lVal.  For every reconciliation in
    the first list it returns, the reconciliation is written with out(),
    passed through temporal_consistency_fixer and written again; the
    smallest cost returned by that second out() call is tracked.

    The report goes to fixerOut/COG<iiii>.txt (the directory is created if
    missing); a shorter summary is printed to stdout.
    """
    if not os.path.exists("fixerOut"):
        os.mkdir("fixerOut")

    for i in range(fileNum):
        # zfill pads to four digits and leaves longer numbers untouched.
        index = str(i + 1).zfill(4)
        fileName = "real-100taxa/COG" + index + ".newick"
        if not os.path.isfile(fileName):
            continue

        # 'with' closes the report even when a step below raises.
        with open("fixerOut/COG" + index + ".txt", 'w') as outFile:
            # fileName[13:] drops the "real-100taxa/" directory prefix.
            print(fileName[13:])
            outFile.write(fileName[13:] + "\n")

            S_dict, G_dict, _ = newickFormatReader.getInput(fileName)
            S, G = eteTreeReader(fileName)
            recs, allRecs = MasterReconciliation.Reconcile(
                ["", fileName, str(dVal), str(tVal), str(lVal),
                 "unit", "0", "1", "0", "1"])
            totRecs = len(allRecs)

            print("# of Infeasible Reconciliations: {0}".format(len(recs)))
            outFile.write("# of Reconciliations: {0}\n".format(totRecs))
            outFile.write(
                "# of Infeasible Reconciliations: {0}\n".format(len(recs)))

            min_cost = None
            for rec_tree in recs:
                alpha = recon_tree_to_dtl(rec_tree)
                out(S, G, alpha, outFile)
                alpha, pull_up = temporal_consistency_fixer(
                    G, G_dict, S, S_dict, alpha)
                cost = out(S, G, alpha, outFile)
                if min_cost is None or cost < min_cost:
                    min_cost = cost
                print("number of operations: {0}".format(pull_up))
                outFile.write("number of operations: {0}\n".format(pull_up))

            print("min total: %s" % (min_cost,))
            outFile.write("min total: " + str(min_cost) + "\n")
def Reconcile(argList):
    """Reconcile a newick file and write visualisation files per solution.

    argList layout (index 0 is ignored):
      [1] .newick file path
      [2] duplication cost, [3] transfer cost, [4] loss cost
      [5] scoring type: "xscape", "unit", or anything else for the default
          scores produced by DP.DP
      [6] switch low, [7] switch high, [8] loss low, [9] loss high --
          read only when the scoring type is "xscape"

    For every reconciliation returned by Greedy.Greedy the host tree is
    written through newickToVis.convert (the parasite tree is requested
    only for the first one) and the reconciliation through
    reconConversion.convert.  The DP graph and reconciliation count are
    printed to stdout.  Nothing is returned.
    """
    fileName = argList[1]        # .newick file
    D = float(argList[2])        # Duplication cost
    T = float(argList[3])        # Transfer cost
    L = float(argList[4])        # Loss cost
    freqType = argList[5]        # Frequency type

    # The ranges are documented as optional unless xscape scoring is used,
    # so only demand them in that case instead of failing with IndexError.
    if freqType == "xscape":
        switchLo = float(argList[6])   # Switch lower boundary
        switchHi = float(argList[7])   # Switch upper boundary
        lossLo = float(argList[8])     # Loss lower boundary
        lossHi = float(argList[9])     # Loss upper boundary

    host, paras, phi = newickFormatReader.getInput(fileName)
    # NOTE(review): hostRoot and Order are never used below; the calls are
    # kept in case they have side effects -- confirm and remove.
    hostRoot = cycleCheckingGraph.findRoot(host)
    hostv = cycleCheckingGraph.treeFormat(host)
    Order = orderGraph.date(hostv)

    # Default scoring function (used when no other scoring type matches).
    DTLReconGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
    print("%s %s" % (DTLReconGraph, numRecon))

    if freqType == "xscape":
        # xScape scoring function
        DTLReconGraph = calcCostscapeScore.newScoreWrapper(
            fileName, switchLo, switchHi, lossLo, lossHi, D, T, L)
    elif freqType == "unit":
        # Unit scoring function
        DTLReconGraph = unitScoreDTL(host, paras, phi, D, T, L)

    # Greedy gets its own copy; the graph passed to reconConversion below
    # is the one from before the Greedy call.
    DTLGraph = copy.deepcopy(DTLReconGraph)
    scoresList, rec = Greedy.Greedy(DTLGraph, paras)

    for n in range(len(rec)):
        graph = cycleCheckingGraph.buildReconciliation(host, paras, rec[n])
        currentOrder = orderGraph.date(graph)
        if currentOrder == "timeTravel":
            # Replace the reconciliation with the one returned by the cycle
            # detector and date the graph it hands back.
            rec[n], currentOrder = detectCycles.detectCyclesWrapper(
                host, paras, rec[n])
            currentOrder = orderGraph.date(currentOrder)
        hostOrder = hOrder(hostv, currentOrder)
        hostBranchs = branch(hostv, hostOrder)
        # Only the first reconciliation asks for the parasite tree file.
        newickToVis.convert(fileName, hostBranchs, n, 1 if n == 0 else 0)
        # fileName[:-7] is the file name minus the .newick
        reconConversion.convert(rec[n], DTLReconGraph, paras,
                                fileName[:-7], n)
def reconcile(file_name, dup_cost, transfer_cost, loss_cost):
    """
    Read a data file and compute its DTL reconciliation graph.

    :param file_name: path (string) of the data set to reconcile; it is
        parsed with newickFormatReader.getInput, so it is expected to be a
        .newick file
    :param dup_cost: cost charged for a duplication event
    :param transfer_cost: cost charged for a transfer event
    :param loss_cost: cost charged for a loss event
    :return: a 5-tuple of the host tree, the parasite tree, the
        DTLReconGraph, the number of MPRs and the list of best roots, the
        last three as produced by DP. The best cost that DP also computes
        is not returned.
    """
    # The trees are part of the return value so that callers can reuse
    # them without reading and reconciling the file a second time.
    host, paras, phi = newickFormatReader.getInput(file_name)
    dp_result = DP(host, paras, phi, dup_cost, transfer_cost, loss_cost)
    graph, _best_cost, num_recon, best_roots = dp_result
    return host, paras, graph, num_recon, best_roots
def freqSummation(argList):
    """Score the reconciliations of a newick file and write a summary file.

    argList layout:
      [0] newick file path (the last 7 characters, e.g. '.newick', are
          stripped to form the output prefix)
      [1] duplication cost, [2] transfer cost, [3] loss cost
      [4] scoring type: "Frequency", "xscape" or "unit"
      [5] switch low, [6] switch high, [7] loss low, [8] loss high
          (forwarded to the xscape scoring)

    Writes <prefix>freqFile.txt with four lines: the list of scores
    returned by greedy.Greedy, their sum, the event cost of the first
    reconciliation in the returned list (0 when the list is empty), and
    the number of reconciliations reported by dp.DP.

    Raises ValueError when the scoring type is not one of the three above.
    """
    newickFile = argList[0]
    costs = {
        'D': float(argList[1]),
        'T': float(argList[2]),
        'L': float(argList[3]),
    }
    freqType = argList[4]
    switchLo = float(argList[5])
    switchHi = float(argList[6])
    lossLo = float(argList[7])
    lossHi = float(argList[8])
    fileName = newickFile[:-7]

    host, paras, phi = newickFormatReader.getInput(newickFile)
    DTL, numRecon = dp.DP(host, paras, phi,
                          costs['D'], costs['T'], costs['L'])

    if freqType == "Frequency":
        newDTL = DTL
    elif freqType == "xscape":
        # NOTE(review): a newScoreWrapper variant in this code base returns
        # (newDTL, numRecon, leaves); if that is the one imported here the
        # result needs unpacking before being passed to Greedy -- confirm.
        newDTL = calcCostscapeScore.newScoreWrapper(
            newickFile, switchLo, switchHi, lossLo, lossHi,
            costs['D'], costs['T'], costs['L'])
    elif freqType == "unit":
        newDTL = masterReconciliation.unitScoreDTL(
            host, paras, phi, costs['D'], costs['T'], costs['L'])
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("unknown frequency type: %r" % (freqType,))

    scoresList, reconciliation = greedy.Greedy(newDTL, paras)
    totalSum = sum(scoresList)

    # Cost of the first reconciliation; event types without an entry in
    # costs contribute nothing.  An empty result leaves the cost at 0
    # instead of raising IndexError.
    totalCost = 0
    if reconciliation:
        first = reconciliation[0]
        for key in first:
            totalCost += costs.get(first[key][0], 0)

    # Open the output only once everything is computed, so a failure above
    # does not leave a truncated file behind, and close it on any error.
    with open("{}freqFile.txt".format(fileName), 'w') as f:
        f.write("{}\n".format(scoresList))
        f.write("{}\n".format(totalSum))
        f.write("{}\n".format(totalCost))
        f.write("{}".format(numRecon))
def run_test(fileName, max_k):
    """Report whether a newick file reaches the reconciliation threshold.

    The reconciliation graph and the reconciliation count are computed by
    DP.DP with fixed costs (D=2, T=3, L=1) and cached in
    ../cache/<basename>.graph and ../cache/<basename>.count; both are
    rebuilt when either file is missing.  The count is then read back from
    the cache and compared with the module-level recon_threshold: a line
    starting with 'FALSE:' (below the threshold) or 'TRUE: ' is written to
    stderr, followed by the file name and the count.

    fileName -- path of the newick file to process
    max_k    -- not used by this variant
    """
    cache_dir = '../cache'
    D = 2.
    T = 3.
    L = 1.

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        with open('%s/README' % cache_dir, 'w') as readme:
            readme.write('This directory holds a cache of reconciliation graph for the TreeLife data set')

    base_name = os.path.split(fileName)[1]
    cache_location = '%s/%s.graph' % (cache_dir, base_name)
    recon_count_location = '%s/%s.count' % (cache_dir, base_name)

    if not (os.path.isfile(cache_location)
            and os.path.isfile(recon_count_location)):
        sys.stderr.write('A reconciliation graph has not been built yet for this newick file' + '\n')
        sys.stderr.write('Doing so now and caching it in {%s}...' % cache_location + '\n')

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        # 'with' guarantees both cache files are closed even on error.
        with open(cache_location, 'w+') as graph_file:
            graph_file.write(repr(DictGraph))
        with open(recon_count_location, 'w+') as count_file:
            count_file.write(str(numRecon))

    sys.stderr.write('Loading reonciliation graph from cache' + '\n')
    # Only the count is needed here; the cached graph itself is not loaded.
    with open(recon_count_location) as count_file:
        numRecon = float(count_file.read())

    # The labels end in a tab, so (as with the Python 2 print statement)
    # no extra space is inserted between the label and the file name.
    if numRecon < recon_threshold:
        sys.stderr.write('%s%s %s\n' % ('FALSE:\t', fileName, numRecon))
    else:
        sys.stderr.write('%s%s %s\n' % ('TRUE: \t', fileName, numRecon))
def convert(fileName, HostOrder, n, writeParasite):
    """Write the host tree (and optionally the parasite tree) to files.

    fileName      -- original .newick file; its text must split on ';' into
                     exactly three parts (host tree, parasite tree, mapping)
    HostOrder     -- dictionary of host tree branch lengths, passed to
                     treelib1.parse_newick together with the host newick
    n             -- number inserted into the host output file name
    writeParasite -- when true, the parasite tree is written as well

    With <prefix> being fileName minus its last 7 characters ('.newick'),
    the host tree goes to <prefix><n>.stree via treelib1.write_newick and
    the parasite newick string to <prefix>.tree.
    """
    # 'with' closes the input even if reading fails.
    with open(fileName, 'r') as newick_in:
        contents = newick_in.read()

    # NOTE(review): the parsed trees and hostRoot are not used below; the
    # calls are kept in case callers rely on getInput rejecting malformed
    # files -- confirm and remove.
    host, paras, phi = newickFormatReader.getInput(fileName)
    hostRoot = cycleCheckingGraph.findRoot(host)

    H, P, phi = contents.split(";")
    P = P.strip()
    H = H.strip() + ';'

    host = treelib1.parse_newick(H, HostOrder)

    # The former loop that spliced branch lengths into the H string was
    # dropped: its result was never read, since the tree written below
    # comes from parse_newick.
    with open(fileName[:-7] + str(n) + ".stree", 'w') as host_out:
        treelib1.write_newick(host, host_out, root_data=True)

    if writeParasite:
        with open(fileName[:-7] + '.tree', 'w') as parasite_out:
            parasite_out.write(P + ";")
def convert(fileName, HostOrder, n, writeParasite):
    """Write the host tree (and optionally the parasite tree) to files.

    fileName      -- original .newick file; its text must split on ';' into
                     exactly three parts (host tree, parasite tree, mapping)
    HostOrder     -- dictionary of host tree branch lengths, passed to
                     treelib1.parse_newick together with the host newick
    n             -- number inserted into the host output file name
    writeParasite -- when true, the parasite tree is written as well

    With <prefix> being fileName minus its last 7 characters ('.newick'),
    the host tree goes to <prefix><n>.stree via treelib1.write_newick and
    the parasite newick string to <prefix>.tree.
    """
    # 'with' closes the input even if reading fails.
    with open(fileName, 'r') as newick_in:
        contents = newick_in.read()

    # NOTE(review): the parsed trees and hostRoot are not used below; the
    # calls are kept in case callers rely on getInput rejecting malformed
    # files -- confirm and remove.
    host, paras, phi = newickFormatReader.getInput(fileName)
    hostRoot = cycleCheckingGraph.findRoot(host)

    H, P, phi = contents.split(";")
    P = P.strip()
    H = H.strip() + ';'

    host = treelib1.parse_newick(H, HostOrder)

    # The former loop that spliced branch lengths into the H string was
    # dropped: its result was never read, since the tree written below
    # comes from parse_newick.
    with open(fileName[:-7] + str(n) + ".stree", 'w') as host_out:
        treelib1.write_newick(host, host_out, root_data=True)

    if writeParasite:
        with open(fileName[:-7] + '.tree', 'w') as parasite_out:
            parasite_out.write(P + ";")
def Reconcile(argList):
    """Reconcile a newick file and single out the infeasible solutions.

    argList layout (index 0 is ignored):
      [1] .newick file path
      [2] duplication cost, [3] transfer cost, [4] loss cost
      [5] scoring type; "unit" switches to unitScoreDTL, any other value
          keeps the scores produced by DP.DP (xscape scoring is not
          available in this variant)
    Further entries (the switch/loss ranges of the xscape variant) are
    accepted but ignored.

    Returns (infeasible_recs, recs): recs is the list of reconciliations
    returned by Greedy.Greedy and infeasible_recs the sub-list for which
    orderGraph.date reports False on the built reconciliation graph.
    """
    fileName = argList[1]        # .newick file
    D = float(argList[2])        # Duplication cost
    T = float(argList[3])        # Transfer cost
    L = float(argList[4])        # Loss cost
    freqType = argList[5]        # Frequency type

    host, paras, phi = newickFormatReader.getInput(fileName)
    # NOTE(review): hostRoot is never used below; the call is kept in case
    # it has side effects -- confirm and remove.
    hostRoot = ReconciliationGraph.findRoot(host)

    # Default scoring function (used unless unit scoring is requested).
    DTLReconGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

    if freqType == "unit":
        # Unit scoring function
        DTLReconGraph = unitScoreDTL(host, paras, phi, D, T, L)

    # Greedy works on its own copy of the graph.
    DTLGraph = copy.deepcopy(DTLReconGraph)
    scoresList, recs = Greedy.Greedy(DTLGraph, paras)

    # '== False' is kept on purpose: 'not result' would also catch other
    # falsy values that date() might return for a feasible ordering.
    infeasible_recs = [
        rec for rec in recs
        if orderGraph.date(
            ReconciliationGraph.buildReconciliation(host, paras, rec)
        ) == False
    ]
    return infeasible_recs, recs
def run_test(fileName, max_k):
    """Run seeded K-means over the reconciliations of one newick file.

    The reconciliation graph and count are computed by DP.DP with fixed
    costs (D=2, T=3, L=1) and cached in ./cache/<basename>.graph and
    ./cache/<basename>.count; both are rebuilt when either is missing and
    are always read back from the cache afterwards.  Files whose count is
    below the module-level recon_threshold are skipped.

    For each seed 0..4, KMeans.k_means is called for k = 1..max_k with
    that seed and no initial representatives, and a blank line is printed
    after each seed.

    fileName -- path of the newick file to process
    max_k    -- largest number of clusters to try

    The file name is printed to stdout, progress goes to stderr, and
    nothing is returned.
    """
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    # "FILE: " ends in a space, so the Python 2 print statement this
    # replaces emitted two spaces before the name; '%s %s' reproduces it.
    sys.stderr.write('%s %s\n' % ("FILE: ", fileName))
    print(fileName)
    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        with open('%s/README' % cache_dir, 'w') as readme:
            readme.write('This directory holds a cache of reconciliation graph for the TreeLife data set')

    base_name = os.path.split(fileName)[1]
    cache_location = '%s/%s.graph' % (cache_dir, base_name)
    recon_count_location = '%s/%s.count' % (cache_dir, base_name)

    if not (os.path.isfile(cache_location)
            and os.path.isfile(recon_count_location)):
        sys.stderr.write('A reconciliation graph has not been built yet for this newick file' + '\n')
        sys.stderr.write('Doing so now and caching it in {%s}...' % cache_location + '\n')

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        # 'with' guarantees both cache files are closed even on error.
        with open(cache_location, 'w+') as graph_file:
            graph_file.write(repr(DictGraph))
        with open(recon_count_location, 'w+') as count_file:
            count_file.write(str(numRecon))

    sys.stderr.write('Loading reonciliation graph from cache' + '\n')
    with open(cache_location) as graph_file:
        # NOTE(review): eval() executes whatever the cache file contains.
        # That is only safe while ./cache is written exclusively by this
        # script; consider ast.literal_eval if the graph is a pure literal.
        DictGraph = eval(graph_file.read())
    with open(recon_count_location) as count_file:
        numRecon = float(count_file.read())

    # Only consider running the algorithm for reconciliations with more
    # than threshold MPRs.
    if numRecon < recon_threshold:
        sys.stderr.write('%s %s\n' % ('Too few reconciliations: ', numRecon))
        return
    sys.stderr.write('%s %s\n' % ('Reconciliation Count: ', numRecon))

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    sys.stderr.write('Found cluster representatives using point-collecting' + '\n')

    graph = ReconGraph.ReconGraph(DictGraph)
    # NOTE(review): these representatives are no longer passed to k_means
    # (it receives None below); they are still computed in case the calls
    # have side effects on the graph or the random state -- confirm.
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep)
               for dictRep in dictReps]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for _ in range(max_k)]
    representatives = setReps + extra_reps

    sys.stderr.write('Starting K Means algorithm ... ' + '\n')
    sys.stderr.write('Printing Average and Maximum cluster radius at each step' + '\n')

    for seed in range(5):
        for i in range(1, max_k + 1):
            KMeans.k_means(graph, 10, i, seed, None)
        # Blank line separating the output of consecutive seeds.
        print("")
def main():
    """Convert the COG newick files into tab-separated tree files.

    For i = 1..6000, the file real-100taxa/COG<iiii>.newick (index zero
    padded to four digits) is converted to treeFiles/COG<iiii>.tree if it
    exists; the treeFiles directory is created when missing.

    Host and parasite nodes share one running numbering: host nodes are
    numbered first, parasite nodes continue after them.  The output has
    the sections HOSTTREE, HOSTNAMES, PARASITETREE, PARASITENAMES and PHI.
    Tree rows are '<node>\\t<child>\\t<child>' with 'null' for both
    children when the node's entry is [None, None]; name rows are
    '<node>\\t<name>'; PHI rows are '<host node>\\t<parasite node>'.
    """
    if not os.path.exists("treeFiles"):
        os.mkdir("treeFiles")

    for i in range(6000):
        # zfill pads to four digits and leaves longer numbers untouched.
        index = str(i + 1).zfill(4)
        inFile = "real-100taxa/COG" + index + ".newick"
        if not os.path.isfile(inFile):
            continue

        host, parasite, phi = newickFormatReader.getInput(inFile)
        H = treeFormat(host)
        P = treeFormat(parasite)

        # name -> index; one counter runs across both trees.
        H_dict = {}
        P_dict = {}
        count = 0
        for tree, ids in ((H, H_dict), (P, P_dict)):
            for key in tree:
                count += 1
                ids[key] = count

        sections = (
            ("HOSTTREE\n", "\nHOSTNAMES\n", H, H_dict),
            ("\nPARASITETREE\n", "\nPARASITENAMES\n", P, P_dict),
        )

        # 'with' closes the output even when a lookup below raises.
        with open("treeFiles/COG" + index + ".tree", 'w') as outFile:
            for tree_header, names_header, tree, ids in sections:
                outFile.write(tree_header)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t")
                    if tree[key] == [None, None]:
                        outFile.write("null\tnull\n")
                    else:
                        outFile.write(str(ids[tree[key][0]]) + "\t"
                                      + str(ids[tree[key][1]]) + "\n")
                outFile.write(names_header)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t" + key + "\n")

            outFile.write("\nPHI\n")
            for key in phi:
                outFile.write(str(H_dict[phi[key]]) + "\t"
                              + str(P_dict[key]) + "\n")
def run_test(fileName, max_k):
    """Run the K-centers experiment on the reconciliations of a newick file.

    The reconciliation graph and count are computed by DP.DP with fixed
    costs (D=2, T=3, L=1) and cached in ./cache/<basename>.graph and
    ./cache/<basename>.count; both are rebuilt when either is missing and
    are always read back from the cache afterwards.  Files whose count is
    below the module-level recon_threshold are skipped.

    Starting from the first reconciliation returned by Greedy.Greedy, up
    to max_k further representatives are added, each one the result of
    maximize().  Each step prints one stdout line '<k> <min distance>
    <average>', where the average is taken over 10 draws of k weighted
    random templates scored with min_d(maximize(...)).  The loop stops
    early when maximize() reports a distance that is not positive.

    fileName -- path of the newick file to process
    max_k    -- maximum number of representatives to add

    The file name is printed to stdout, diagnostics go to stderr, and
    nothing is returned.
    """
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    # "FILE: " ends in a space, so the Python 2 print statement this
    # replaces emitted two spaces before the name; '%s %s' reproduces it.
    sys.stderr.write('%s %s\n' % ("FILE: ", fileName))
    print(fileName)
    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        with open('%s/README' % cache_dir, 'w') as readme:
            readme.write('This directory holds a cache of reconciliation graph for the TreeLife data set')

    base_name = os.path.split(fileName)[1]
    cache_location = '%s/%s.graph' % (cache_dir, base_name)
    recon_count_location = '%s/%s.count' % (cache_dir, base_name)

    if not (os.path.isfile(cache_location)
            and os.path.isfile(recon_count_location)):
        sys.stderr.write('A reconciliation graph has not been built yet for this newick file' + '\n')
        sys.stderr.write('Doing so now and caching it in {%s}...' % cache_location + '\n')

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        # 'with' guarantees both cache files are closed even on error.
        with open(cache_location, 'w+') as graph_file:
            graph_file.write(repr(DictGraph))
        with open(recon_count_location, 'w+') as count_file:
            count_file.write(str(numRecon))

    sys.stderr.write('Loading reonciliation graph from cache' + '\n')
    with open(cache_location) as graph_file:
        # NOTE(review): eval() executes whatever the cache file contains.
        # That is only safe while ./cache is written exclusively by this
        # script; consider ast.literal_eval if the graph is a pure literal.
        DictGraph = eval(graph_file.read())
    with open(recon_count_location) as count_file:
        numRecon = float(count_file.read())

    # Only consider running the algorithm for reconciliations with more
    # than threshold MPRs.
    if numRecon < recon_threshold:
        sys.stderr.write('%s %s\n' % ('Too few reconciliations: ', numRecon))
        return
    sys.stderr.write('%s %s\n' % ('Reconciliation Count: ', numRecon))

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)
    graph = ReconGraph.ReconGraph(DictGraph)
    representatives = [ReconGraph.dictRecToSetRec(graph, dictReps[0])]

    # Debug check: report entries whose last field exceeds 1 and does not
    # round back to 1.0 (presumably a likelihood -- confirm).
    sys.stderr.write("== Checking for likelihoods over 1 ==" + '\n')
    found = False
    for key in DictGraph.keys():
        children = DictGraph[key]
        # The final element of each entry is skipped by the original check.
        for child in children[:-1]:
            if child[-1] > 1:
                # Attempt to round to fix large float math errors
                roundedValue = round(child[-1])
                if roundedValue != 1.0:
                    sys.stderr.write(
                        '%s %s %s\n' % ("ERR FOUND: ", key, child))
                    found = True
    if not found:
        sys.stderr.write("NO ERR(s)" + '\n')
    sys.stderr.write("== End of over 1 checks. ==" + '\n')

    sys.stderr.write('Starting K-centers algorithm ... ' + '\n')
    for i in range(2, max_k + 2):
        d, newrep = maximize(graph, representatives)
        if not all(d_i > 0 for d_i in d):
            sys.stderr.write('%s %s\n' % ("Distance vector contains 0", d))
            break

        # First two columns of the result line; the average completes it
        # below (the original used a trailing-comma print for this).
        sys.stdout.write('%s %s' % (i - 1, min(d)))
        representatives.append(newrep)

        dist_sum = 0
        n = 10
        for _ in range(n):
            reps = [KMeans.get_weighted_template(graph)
                    for _ in range(i - 1)]
            dist_sum += min_d(maximize(graph, reps))
        sys.stdout.write(' %s\n' % (float(dist_sum) / n))

    sys.stderr.write("Finished k centers algorithm ..." + '\n')
def main():
    """Convert the COG newick files into tab-separated tree files.

    For i = 1..6000, the file real-100taxa/COG<iiii>.newick (index zero
    padded to four digits) is converted to treeFiles/COG<iiii>.tree if it
    exists; the treeFiles directory is created when missing.

    Host and parasite nodes share one running numbering: host nodes are
    numbered first, parasite nodes continue after them.  The output has
    the sections HOSTTREE, HOSTNAMES, PARASITETREE, PARASITENAMES and PHI.
    Tree rows are '<node>\\t<child>\\t<child>' with 'null' for both
    children when the node's entry is [None, None]; name rows are
    '<node>\\t<name>'; PHI rows are '<host node>\\t<parasite node>'.
    """
    if not os.path.exists("treeFiles"):
        os.mkdir("treeFiles")

    for i in range(6000):
        # zfill pads to four digits and leaves longer numbers untouched.
        index = str(i + 1).zfill(4)
        inFile = "real-100taxa/COG" + index + ".newick"
        if not os.path.isfile(inFile):
            continue

        host, parasite, phi = newickFormatReader.getInput(inFile)
        H = treeFormat(host)
        P = treeFormat(parasite)

        # name -> index; one counter runs across both trees.
        H_dict = {}
        P_dict = {}
        count = 0
        for tree, ids in ((H, H_dict), (P, P_dict)):
            for key in tree:
                count += 1
                ids[key] = count

        sections = (
            ("HOSTTREE\n", "\nHOSTNAMES\n", H, H_dict),
            ("\nPARASITETREE\n", "\nPARASITENAMES\n", P, P_dict),
        )

        # 'with' closes the output even when a lookup below raises.
        with open("treeFiles/COG" + index + ".tree", 'w') as outFile:
            for tree_header, names_header, tree, ids in sections:
                outFile.write(tree_header)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t")
                    if tree[key] == [None, None]:
                        outFile.write("null\tnull\n")
                    else:
                        outFile.write(str(ids[tree[key][0]]) + "\t"
                                      + str(ids[tree[key][1]]) + "\n")
                outFile.write(names_header)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t" + key + "\n")

            outFile.write("\nPHI\n")
            for key in phi:
                outFile.write(str(H_dict[phi[key]]) + "\t"
                              + str(P_dict[key]) + "\n")