Esempio n. 1
0
def reconcile(fileName, D, T, L):
    """Build the DTL reconciliation graph for a newick file.

    fileName is read with newickFormatReader.getInput, which yields the host
    tree, the parasite tree and the tip mapping. Those three values are handed
    to DP together with the duplication cost D, the transfer cost T and the
    loss cost L, and whatever DP returns is passed straight back.
    """
    hostTree, parasiteTree, tipMapping = newickFormatReader.getInput(fileName)
    return DP(hostTree, parasiteTree, tipMapping, D, T, L)
Esempio n. 2
0
def reconcile(fileName, D, T, L):
    """Return the result of running DP on the contents of a newick file.

    The host tree, parasite tree and tip mapping are extracted from fileName
    by newickFormatReader.getInput. DP is then called with those values and
    the duplication (D), transfer (T) and loss (L) costs; its return value is
    returned unchanged.
    """
    parsed = newickFormatReader.getInput(fileName)
    hostTree, parasiteTree, tipMapping = parsed
    return DP(hostTree, parasiteTree, tipMapping, D, T, L)
Esempio n. 3
0
def main():
    """Parse the command line and run the requested functionality.

    The input file named by args.filename is read once with getInput; the
    parsed data is then handed to the handler selected by
    args.functionality ("costscape", "reconcile", "histogram" or "clumpr").
    Any other value is ignored and nothing is run.
    """
    args = process_arg()
    newick_data = getInput(args.filename)
    mode = args.functionality

    if mode == "costscape":
        costscape.solve(newick_data, args.tl, args.th, args.ll, args.lh, args)
        return
    if mode == "reconcile":
        DTLReconGraph.reconcile_noninter(newick_data, args.d, args.t, args.l)
        return
    if mode == "histogram":
        HistogramMain.compute_pdv(args.filename, newick_data, args.d, args.t, args.l, args)
        return
    if mode == "clumpr":
        ClusterMain.perform_clustering(newick_data, args.d, args.t, args.l, args.k, args)
Esempio n. 4
0
def newScoreWrapper(newickFile, switchLo, switchHi, lossLo, lossHi, D, T, L):
    """Return the DTL graph of newickFile re-scored from costscape results.

    The host tree, parasite tree and phi are read from newickFile, and DP is
    run on them with duplication cost D, transfer cost T and loss cost L.
    findCenters is called with newickFile and the switch/loss bounds
    (switchLo, switchHi, lossLo, lossHi); its points are converted by
    getDTLVals and getCostscapeDTLs, and changeDTLScores applies the result
    to the graph produced by DP.

    Returns a tuple (newDTL, numRecon, leaves), where numRecon and leaves are
    the second and third values returned by DP.
    """
    hostTree, parasiteTree, phi = newickFormatReader.getInput(newickFile)
    baseDTL, numRecon, leaves = DP(hostTree, parasiteTree, phi, D, T, L)
    centers = findCenters(newickFile, switchLo, switchHi, lossLo, lossHi)
    costscapeDTLs = getCostscapeDTLs(getDTLVals(centers), hostTree,
                                     parasiteTree, phi)
    return changeDTLScores(baseDTL, costscapeDTLs), numRecon, leaves
def newScoreWrapper(newickFile, switchLo, switchHi, lossLo, lossHi, D, T, L):
    """Re-score the DTL graph of a newick file using costscape output.

    Steps, in order: read the host tree, parasite tree and phi from
    newickFile; run DP with costs D (duplication), T (transfer) and L (loss);
    call findCenters with newickFile and the four switch/loss bounds; turn
    the resulting points into DTL values (getDTLVals) and then into DTLs
    (getCostscapeDTLs); finally let changeDTLScores update the DP graph.

    Returns (newDTL, numRecon, leaves); the last two come directly from DP.
    """
    parsed = newickFormatReader.getInput(newickFile)
    hostTree, parasiteTree, phi = parsed
    dpGraph, numRecon, leaves = DP(hostTree, parasiteTree, phi, D, T, L)
    centerPoints = findCenters(newickFile, switchLo, switchHi, lossLo, lossHi)
    dtlValues = getDTLVals(centerPoints)
    rescored = changeDTLScores(
        dpGraph, getCostscapeDTLs(dtlValues, hostTree, parasiteTree, phi))
    return rescored, numRecon, leaves
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write(
            'This directory holds a cache of reconciliation graph for the TreeLife data set'
        )
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        f = open(cache_location, 'w+')
        f.write(repr(DictGraph))
        f.close()

    print >> sys.stderr, 'Loading reonciliation graph from cache'
    f = open(cache_location)
    DictGraph = eval(f.read())
    f.close()

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [
        ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps
    ]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]

    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ... '
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'

    for i in xrange(1, max_k + 1):
        print 'k = %d' % i
        KMeans.k_means(graph, 10, i, 0, representatives[:i])
def freqSummation(argList):
    """Write reconciliation scores and costs for a newick file to a text file.

    argList layout (index 0 is ignored):
        [1] newick file name, [2] duplication cost, [3] transfer cost,
        [4] loss cost, [5] scoring type ("Frequency", "xscape" or "unit"),
        [6] switchLo, [7] switchHi, [8] lossLo, [9] lossHi
        (the last four are only passed on for "xscape" scoring).

    The output file is the newick file name minus its last 7 characters
    (presumably the ".newick" suffix) plus "freqFile.txt". It holds four
    lines: the score list from Greedy.Greedy, the sum of those scores, a
    reconciliation cost, and the reconciliation count returned by DP.DP.

    Raises ValueError if the scoring type is not one of the three names
    above (this used to surface later as a NameError on newDTL).
    """
    newickFile = argList[1]
    D = float(argList[2])
    T = float(argList[3])
    L = float(argList[4])
    freqType = argList[5]
    switchLo = float(argList[6])
    switchHi = float(argList[7])
    lossLo = float(argList[8])
    lossHi = float(argList[9])
    if freqType not in ("Frequency", "xscape", "unit"):
        raise ValueError(
            "unknown freqType %r; expected 'Frequency', 'xscape' or 'unit'"
            % (freqType,))
    fileName = newickFile[:-7]

    host, paras, phi = newickFormatReader.getInput(newickFile)
    DTL, numRecon = DP.DP(host, paras, phi, D, T, L)
    if freqType == "Frequency":
        newDTL = DTL
    elif freqType == "xscape":
        newDTL = calcCostscapeScore.newScoreWrapper(newickFile, switchLo,
                                                    switchHi, lossLo, lossHi,
                                                    D, T, L)
    else:  # "unit"
        newDTL = MasterReconciliation.unitScoreDTL(host, paras, phi, D, T, L)
    scoresList, reconciliation = Greedy.Greedy(newDTL, paras)
    totalSum = sum(scoresList)

    eventCosts = {"L": L, "T": T, "D": D}
    # Start at 0 so an empty reconciliation list no longer leaves totalCost
    # unbound at write time.
    totalCost = 0
    for rec in reconciliation:
        # NOTE(review): the cost is reset per reconciliation, so the value
        # written is that of the last one only. Kept as in the original;
        # confirm this is intended rather than a sum over all of them.
        totalCost = 0
        for key in rec:
            totalCost += eventCosts.get(rec[key][0], 0)

    # Open the output only once everything is computed, and let 'with' close
    # it, so a failure above neither leaks a handle nor leaves an empty file.
    with open(fileName + "freqFile.txt", 'w') as f:
        f.write(str(scoresList) + '\n')
        f.write(str(totalSum) + '\n')
        f.write(str(totalCost) + '\n')
        f.write(str(numRecon))
def freqSummation(argList):
	"""Takes as input an argument list containing a newick file of host and 
	parasite trees as well as their phi mapping, duplication, transfer, and 
	loss costs, the type of frequency scoring to be used, as well as switch 
	and loss cost ranges for xscape scoring, and returns a file containing the
	list of scores for each individual reconciliation, the sum of the those 
	scores, the total cost of those reconciliations and the number of 
	reconciliations of those trees."""
	newickFile = argList[1]
	D = float(argList[2])
	T = float(argList[3])
	L = float(argList[4])
	freqType = argList[5]
	switchLo = float(argList[6])
	switchHi = float(argList[7])
	lossLo = float(argList[8])
	lossHi = float(argList[9])
	fileName = newickFile[:-7]
	f = open(fileName+"freqFile.txt", 'w')
	host, paras, phi = newickFormatReader.getInput(newickFile)
	DTL, numRecon = DP.DP(host, paras, phi, D, T, L)
	print numRecon
	if freqType == "Frequency":
		newDTL = DTL
	elif freqType == "xscape":
		newDTL = calcCostscapeScore.newScoreWrapper(newickFile, switchLo, \
			switchHi, lossLo, lossHi, D, T, L)
	elif freqType == "unit":
		newDTL = MasterReconciliation.unitScoreDTL(host, paras, phi, D, T, L)
	scoresList, reconciliation = Greedy.Greedy(newDTL, paras)
	totalSum = 0
	for score in scoresList:
		totalSum +=score
	for index in reconciliation:
		totalCost = 0
		for key in index:
			if index[key][0] == "L":
				totalCost+=L
			elif index[key][0] == "T":
				totalCost+=T
			elif index[key][0] == "D":
				totalCost+=D
	f.write(str(scoresList)+'\n')
	f.write(str(totalSum)+'\n')
	f.write(str(totalCost)+'\n')
	f.write(str(numRecon))
	f.close()
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graph for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        f = open(cache_location, 'w+')
        f.write(repr(DictGraph))
        f.close()

    print >> sys.stderr, 'Loading reonciliation graph from cache'
    f = open(cache_location)
    DictGraph = eval(f.read())
    f.close()

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]

    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ... '
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'

    for i in xrange(1, max_k + 1):
        print 'k = %d' % i
        KMeans.k_means(graph, 10, i, 0, representatives[:i])
Esempio n. 10
0
def Reconcile(argList):
    """Takes command-line arguments of a .newick file, duplication, transfer, 
	and loss costs, the type of scoring desired and possible switch and loss 
	ranges. Creates Files for the host, parasite, and reconciliations"""
    fileName = argList[1]  #.newick file
    D = float(argList[2])  # Duplication cost
    T = float(argList[3])  # Transfer cost
    L = float(argList[4])  # Loss cost
    freqType = argList[5]  # Frequency type
    # Optional inputs if freqType == xscape
    switchLo = float(argList[6])  # Switch lower boundary
    switchHi = float(argList[7])  # Switch upper boundary
    lossLo = float(argList[8])  # Loss lower boundary
    lossHi = float(argList[9])  # Loss upper boundary

    host, paras, phi = newickFormatReader.getInput(fileName)
    hostRoot = cycleCheckingGraph.findRoot(host)
    hostv = cycleCheckingGraph.treeFormat(host)
    Order = orderGraph.date(hostv)
    # Default scoring function (if freqtype== Frequency scoring)
    DTLReconGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
    print DTLReconGraph, numRecon
    #uses xScape scoring function
    if freqType == "xscape":
        DTLReconGraph = calcCostscapeScore.newScoreWrapper(fileName, switchLo, \
         switchHi, lossLo, lossHi, D, T, L)
    #uses Unit scoring function
    elif freqType == "unit":
        DTLReconGraph = unitScoreDTL(host, paras, phi, D, T, L)

    DTLGraph = copy.deepcopy(DTLReconGraph)
    scoresList, rec = Greedy.Greedy(DTLGraph, paras)
    for n in range(len(rec)):
        graph = cycleCheckingGraph.buildReconciliation(host, paras, rec[n])
        currentOrder = orderGraph.date(graph)
        if currentOrder == "timeTravel":
            rec[n], currentOrder = detectCycles.detectCyclesWrapper(
                host, paras, rec[n])
            currentOrder = orderGraph.date(currentOrder)
        hostOrder = hOrder(hostv, currentOrder)
        hostBranchs = branch(hostv, hostOrder)
        if n == 0:
            newickToVis.convert(fileName, hostBranchs, n, 1)
        else:
            newickToVis.convert(fileName, hostBranchs, n, 0)
        # filename[:-7] is the file name minus the .newick
        reconConversion.convert(rec[n], DTLReconGraph, paras, fileName[:-7], n)
Esempio n. 11
0
def main():

    if not os.path.exists("fixerOut"):
        os.mkdir("fixerOut")

    for i in xrange(fileNum):

        index = str(i + 1)
        for j in xrange(4 - len(str(i + 1))):
            index = "0" + index

        fileName = "real-100taxa/COG" + index + ".newick"
        if not os.path.isfile(fileName):
            continue

        outFile = open("fixerOut/COG" + index + ".txt", 'w')

        print fileName[13:]
        outFile.write(fileName[13:] + "\n")

        S_dict, G_dict, _ = newickFormatReader.getInput(fileName)
        S, G = eteTreeReader(fileName)
        recs, allRecs = MasterReconciliation.Reconcile(["", fileName, str(dVal), str(tVal), str(lVal), "unit", "0", "1", "0", "1"])

        totRecs = len(allRecs)

        print "# of Infeasible Reconciliations: {0}".format(len(recs))
        outFile.write("# of Reconciliations: {0}\n".format(totRecs))
        outFile.write("# of Infeasible Reconciliations: {0}\n".format(len(recs)))

        min_cost = None

        for T in recs:
            alpha = recon_tree_to_dtl(T)
            out(S, G, alpha, outFile)
            alpha, pull_up = temporal_consistency_fixer(G, G_dict, S, S_dict, alpha)
            cost = out(S, G, alpha, outFile)
            if min_cost is None or cost < min_cost:
                min_cost = cost
            print "number of operations: {0}".format(pull_up)
            outFile.write("number of operations: {0}\n".format(pull_up))

        print "min total:", min_cost
        outFile.write("min total: " + str(min_cost) + "\n")

        outFile.close()
def Reconcile(argList):
    """Takes command-line arguments of a .newick file, duplication, transfer, 
    and loss costs, the type of scoring desired and possible switch and loss 
    ranges. Creates Files for the host, parasite, and reconciliations"""
    fileName = argList[1] #.newick file
    D = float(argList[2]) # Duplication cost
    T = float(argList[3]) # Transfer cost
    L = float(argList[4]) # Loss cost
    freqType = argList[5] # Frequency type
    # Optional inputs if freqType == xscape
    switchLo = float(argList[6]) # Switch lower boundary
    switchHi = float(argList[7]) # Switch upper boundary
    lossLo = float(argList[8]) # Loss lower boundary
    lossHi = float(argList[9]) # Loss upper boundary

    host, paras, phi = newickFormatReader.getInput(fileName)
    hostRoot = cycleCheckingGraph.findRoot(host)
    hostv = cycleCheckingGraph.treeFormat(host)
    Order = orderGraph.date(hostv)
    # Default scoring function (if freqtype== Frequency scoring)
    DTLReconGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
    print DTLReconGraph, numRecon
    #uses xScape scoring function
    if freqType == "xscape":
        DTLReconGraph = calcCostscapeScore.newScoreWrapper(fileName, switchLo, \
            switchHi, lossLo, lossHi, D, T, L)
    #uses Unit scoring function
    elif freqType == "unit":
        DTLReconGraph = unitScoreDTL(host, paras, phi, D, T, L)

    DTLGraph = copy.deepcopy(DTLReconGraph)
    scoresList, rec = Greedy.Greedy(DTLGraph, paras)
    for n in range(len(rec)):
        graph = cycleCheckingGraph.buildReconciliation(host, paras, rec[n])
        currentOrder = orderGraph.date(graph)
        if currentOrder == "timeTravel":
            rec[n], currentOrder = detectCycles.detectCyclesWrapper(host, paras, rec[n])
            currentOrder = orderGraph.date(currentOrder)
        hostOrder = hOrder(hostv,currentOrder)
        hostBranchs = branch(hostv,hostOrder)
        if n == 0:
            newickToVis.convert(fileName,hostBranchs, n, 1)
        else:
            newickToVis.convert(fileName,hostBranchs, n, 0)
        # filename[:-7] is the file name minus the .newick
        reconConversion.convert(rec[n], DTLReconGraph, paras, fileName[:-7], n)
Esempio n. 13
0
def reconcile(file_name, dup_cost, transfer_cost, loss_cost):
    """
    Read a data file and run the DP reconciliation on its trees.

    :param file_name: name of the file holding the data set, as a string. It is
    read with newickFormatReader.getInput, so it is expected to be a .newick file.
    :param dup_cost: the cost associated with a duplication event
    :param transfer_cost: the cost associated with a transfer event
    :param loss_cost: the cost associated with a loss event
    :return: a 5-tuple of the host tree, the parasite tree, the DTLReconGraph,
    the number of MPRs and the list of roots that can be used to produce an MPR.
    The best cost that DP also returns is not part of the result.
    """
    # The trees are returned alongside the graph so that callers do not have
    # to read the file and reconcile a second time.
    host_tree, parasite_tree, tip_map = newickFormatReader.getInput(file_name)
    dp_result = DP(host_tree, parasite_tree, tip_map, dup_cost, transfer_cost,
                   loss_cost)
    recon_graph, _best_cost, mpr_count, roots = dp_result
    return host_tree, parasite_tree, recon_graph, mpr_count, roots
Esempio n. 14
0
def freqSummation(argList):
    """Write reconciliation scores and costs for a newick file to a text file.

    argList layout:
        [0] newick file name, [1] duplication cost, [2] transfer cost,
        [3] loss cost, [4] scoring type ("Frequency", "xscape" or "unit"),
        [5] switchLo, [6] switchHi, [7] lossLo, [8] lossHi
        (the last four are only passed on for "xscape" scoring).

    The output file is the newick file name minus its last 7 characters
    (presumably the ".newick" suffix) plus "freqFile.txt". It holds four
    lines: the score list from greedy.Greedy, the sum of those scores, the
    cost of the first reconciliation (0 if there is none), and the
    reconciliation count returned by dp.DP.

    Raises ValueError if the scoring type is not one of the three names
    above (this used to surface later as a NameError on newDTL).
    """
    newickFile = argList[0]
    costs = {
        'D': float(argList[1]),
        'T': float(argList[2]),
        'L': float(argList[3]),
    }
    freqType = argList[4]
    switchLo = float(argList[5])
    switchHi = float(argList[6])
    lossLo = float(argList[7])
    lossHi = float(argList[8])
    if freqType not in ("Frequency", "xscape", "unit"):
        raise ValueError(
            "unknown freqType {!r}; expected 'Frequency', 'xscape' or 'unit'".format(freqType))
    fileName = newickFile[:-7]

    host, paras, phi = newickFormatReader.getInput(newickFile)
    DTL, numRecon = dp.DP(host, paras, phi, costs['D'], costs['T'], costs['L'])
    if freqType == "Frequency":
        newDTL = DTL
    elif freqType == "xscape":
        newDTL = calcCostscapeScore.newScoreWrapper(newickFile, switchLo, switchHi, lossLo, lossHi, costs['D'],
                                                    costs['T'], costs['L'])
    else:  # "unit"
        newDTL = masterReconciliation.unitScoreDTL(host, paras, phi, costs['D'], costs['T'], costs['L'])
    scoresList, reconciliation = greedy.Greedy(newDTL, paras)
    totalSum = sum(scoresList)

    # Only the first reconciliation is costed; an empty result now yields 0
    # instead of an IndexError.
    totalCost = 0
    if reconciliation:
        first = reconciliation[0]
        for key in first:
            # Events other than D, T and L contribute nothing.
            totalCost += costs.get(first[key][0], 0)

    # Open the output only once everything is computed, and let 'with' close
    # it, so a failure above neither leaks a handle nor leaves an empty file.
    with open("{}freqFile.txt".format(fileName), 'w') as f:
        f.write("{}\n".format(scoresList))
        f.write("{}\n".format(totalSum))
        f.write("{}\n".format(totalCost))
        f.write("{}".format(numRecon))
def run_test(fileName, max_k):
    """Report whether a newick file reaches the reconciliation-count threshold.

    The reconciliation graph and count are computed by DP.DP with fixed costs
    (D=2, T=3, L=1) when either cache file is missing, and stored under
    ../cache as <basename>.graph (repr() text) and <basename>.count. The
    count is then read back from the cache and compared with the module-level
    recon_threshold: 'FALSE:' is printed to stderr when it is below the
    threshold, 'TRUE:' otherwise, followed by the file name and the count.

    max_k is accepted but not used by this version. The cached graph is
    written but not read back here.
    """
    cache_dir = '../cache'
    D = 2.
    T = 3.
    L = 1.
    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        with open('%s/README' % cache_dir, 'w') as readme:
            readme.write('This directory holds a cache of reconciliation graph for the TreeLife data set')

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    recon_count_location = '%s/%s.count' % (cache_dir, os.path.split(fileName)[1])

    if not(os.path.isfile(cache_location)) or not(os.path.isfile(recon_count_location)):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        # 'with' guarantees both handles are closed even if a write fails.
        with open(cache_location, 'w+') as graph_file:
            graph_file.write(repr(DictGraph))
        with open(recon_count_location, 'w+') as count_file:
            count_file.write(str(numRecon))

    print >> sys.stderr, 'Loading reonciliation graph from cache'
    with open(recon_count_location) as count_file:
        numRecon = float(count_file.read())

    if (numRecon < recon_threshold):
        print >> sys.stderr, 'FALSE:\t', fileName, numRecon
    else:
        print >> sys.stderr, 'TRUE: \t', fileName, numRecon
def convert(fileName, HostOrder, n, writeParasite):
    """Write the host tree (and optionally the parasite tree) of a newick file.

    fileName is the original .newick file; its text is split on ';' into
    exactly three parts (host tree, parasite tree, phi), otherwise the
    unpacking raises ValueError. The host part is parsed with
    treelib1.parse_newick together with HostOrder, the dictionary of host
    tree branch lengths, and written to <fileName minus 7 chars><n>.stree.
    When writeParasite is true, the parasite part is written unchanged to
    <fileName minus 7 chars>.tree.
    """
    with open(fileName, 'r') as source:
        contents = source.read()

    # NOTE(review): the results of these two calls are not used below. The
    # calls are kept so that any error they raise on a bad input still
    # surfaces; confirm whether they can be dropped.
    host, paras, phi = newickFormatReader.getInput(fileName)
    cycleCheckingGraph.findRoot(host)

    H, P, phi = contents.split(";")
    P = P.strip()
    H = H.strip() + ';'
    host = treelib1.parse_newick(H, HostOrder)
    # The original rewrote H with branch lengths after parsing; the rewritten
    # string was never read, so that dead loop has been removed.

    with open(fileName[:-7] + str(n) + ".stree", 'w') as hostFile:
        treelib1.write_newick(host, hostFile, root_data=True)

    if writeParasite:
        with open(fileName[:-7] + '.tree', 'w') as parasiteFile:
            parasiteFile.write(P + ";")
def convert(fileName, HostOrder, n, writeParasite):
    """Write the host tree (and optionally the parasite tree) of a newick file.

    fileName is the original .newick file; its text is split on ';' into
    exactly three parts (host tree, parasite tree, phi), otherwise the
    unpacking raises ValueError. The host part is parsed with
    treelib1.parse_newick together with HostOrder, the dictionary of host
    tree branch lengths, and written to <fileName minus 7 chars><n>.stree.
    When writeParasite is true, the parasite part is written unchanged to
    <fileName minus 7 chars>.tree.
    """
    with open(fileName, 'r') as source:
        contents = source.read()

    # NOTE(review): the results of these two calls are not used below. The
    # calls are kept so that any error they raise on a bad input still
    # surfaces; confirm whether they can be dropped.
    host, paras, phi = newickFormatReader.getInput(fileName)
    cycleCheckingGraph.findRoot(host)

    H, P, phi = contents.split(";")
    P = P.strip()
    H = H.strip() + ';'
    host = treelib1.parse_newick(H, HostOrder)
    # The original rewrote H with branch lengths after parsing; the rewritten
    # string was never read, so that dead loop has been removed.

    with open(fileName[:-7] + str(n) + ".stree", 'w') as hostFile:
        treelib1.write_newick(host, hostFile, root_data=True)

    if writeParasite:
        with open(fileName[:-7] + '.tree', 'w') as parasiteFile:
            parasiteFile.write(P + ";")
Esempio n. 18
0
def Reconcile(argList):
    """Reconcile a newick file and return its infeasible reconciliations.

    argList layout (index 0 is ignored):
        [1] .newick file name, [2] duplication cost, [3] transfer cost,
        [4] loss cost, [5] scoring type, [6]-[9] switch/loss bounds.
    The bounds are converted to float but not used: xscape scoring is
    disabled in this version. Only "unit" replaces the graph from DP.DP.

    Returns (infeasible_recs, recs): recs is the full list from
    Greedy.Greedy, and infeasible_recs holds those for which orderGraph.date
    on the built reconciliation compares equal to False.
    """
    fileName = argList[1]       # .newick file
    D = float(argList[2])       # duplication cost
    T = float(argList[3])       # transfer cost
    L = float(argList[4])       # loss cost
    freqType = argList[5]       # scoring type
    # Parsed for parity with the other versions of this function; unused here.
    switchLo = float(argList[6])
    switchHi = float(argList[7])
    lossLo = float(argList[8])
    lossHi = float(argList[9])

    host, paras, phi = newickFormatReader.getInput(fileName)
    hostRoot = ReconciliationGraph.findRoot(host)

    # The graph from DP.DP is used unless unit scoring is requested.
    DTLReconGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
    if freqType == "unit":
        DTLReconGraph = unitScoreDTL(host, paras, phi, D, T, L)

    # Greedy.Greedy is handed a deep copy of the graph.
    scoresList, recs = Greedy.Greedy(copy.deepcopy(DTLReconGraph), paras)

    # '== False' is deliberate: only results equal to False count as infeasible.
    infeasible_recs = [
        rec for rec in recs
        if orderGraph.date(
            ReconciliationGraph.buildReconciliation(host, paras, rec)) == False
    ]

    return infeasible_recs, recs
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    print >> sys.stderr, "FILE: ", fileName
    print fileName


    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graph for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    recon_count_location = '%s/%s.count' % (cache_dir, os.path.split(fileName)[1])
    if not(os.path.isfile(cache_location)) or not(os.path.isfile(recon_count_location)):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        f = open(cache_location, 'w+')
        g = open(recon_count_location, 'w+')
        f.write(repr(DictGraph))
        g.write(str(numRecon))
        f.close()
        g.close()

    print >> sys.stderr, 'Loading reonciliation graph from cache'
    f = open(cache_location)
    g = open(recon_count_location)
    DictGraph = eval(f.read())
    numRecon = float(g.read())
    f.close()
    g.close()

    ## Only consider running algorithm for reconciliations with more than 
    # threshold MPRs
    if (numRecon < recon_threshold):
        print >> sys.stderr, 'Too few reconciliations: ', numRecon
        return 
    else:
        print >> sys.stderr, 'Reconciliation Count: ', numRecon



    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]

    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ... '
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'

    for seed in xrange(5):
        for i in xrange(1, max_k + 1):
            # print 'k = %d' % i
            # KMeans.k_means(graph, 10, i, 0, representatives[:i])
            KMeans.k_means(graph, 10, i, seed, None)
            print
Esempio n. 20
0
def main():
    """Convert the COG newick files into numbered-node .tree files.

    For each index 1..6000 the file real-100taxa/COG<index>.newick (index
    zero-padded to four digits) is converted, if it exists, into
    treeFiles/COG<index>.tree. Host nodes are numbered from 1 in the
    iteration order of the host tree dictionary; parasite nodes continue the
    numbering after the last host node.

    The output has five sections: HOSTTREE and PARASITETREE (node number and
    the numbers of its two children, or "null null" when the children are
    [None, None]), HOSTNAMES and PARASITENAMES (node number and name), and
    PHI (host number, parasite number for each entry of phi).
    """
    if not os.path.exists("treeFiles"):
        os.mkdir("treeFiles")

    for i in xrange(6000):
        # Zero-pad to four digits (e.g. 7 -> "0007").
        index = str(i + 1).zfill(4)

        inFile = "real-100taxa/COG" + index + ".newick"
        if not os.path.isfile(inFile):
            continue

        # 'with' closes the output even if parsing or a lookup below raises.
        with open("treeFiles/COG" + index + ".tree", 'w') as outFile:
            host, parasite, phi = newickFormatReader.getInput(inFile)
            H = treeFormat(host)
            P = treeFormat(parasite)

            H_dict = {}  # name:index
            for number, name in enumerate(H, 1):
                H_dict[name] = number

            P_dict = {}  # name:index, continuing after the host numbers
            for number, name in enumerate(P, len(H_dict) + 1):
                P_dict[name] = number

            # Host and parasite trees share one layout; only the section
            # headers, the tree and the numbering differ.
            sections = (
                ("HOSTTREE\n", "\nHOSTNAMES\n", H, H_dict),
                ("\nPARASITETREE\n", "\nPARASITENAMES\n", P, P_dict),
            )
            for treeHeader, namesHeader, tree, ids in sections:
                outFile.write(treeHeader)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t")
                    if tree[key] == [None, None]:
                        outFile.write("null\tnull\n")
                    else:
                        outFile.write(
                            str(ids[tree[key][0]]) + "\t" +
                            str(ids[tree[key][1]]) + "\n")

                outFile.write(namesHeader)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t" + key + "\n")

            outFile.write("\nPHI\n")
            for key in phi:
                outFile.write(
                    str(H_dict[phi[key]]) + "\t" + str(P_dict[key]) + "\n")
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    print >> sys.stderr, "FILE: ", fileName
    print fileName

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graph for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    recon_count_location = '%s/%s.count' % (cache_dir, os.path.split(fileName)[1])
    if not(os.path.isfile(cache_location)) or not(os.path.isfile(recon_count_location)):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        f = open(cache_location, 'w+')
        g = open(recon_count_location, 'w+')
        f.write(repr(DictGraph))
        g.write(str(numRecon))
        f.close()
        g.close()

    print >> sys.stderr, 'Loading reonciliation graph from cache'
    f = open(cache_location)
    g = open(recon_count_location)
    DictGraph = eval(f.read())
    numRecon = float(g.read())
    f.close()
    g.close()

    
    
    ## Only consider running algorithm for reconciliations with more than 
    # threshold MPRs
    if (numRecon < recon_threshold):
        print >> sys.stderr, 'Too few reconciliations: ', numRecon
        return 
    else:
        print >> sys.stderr, 'Reconciliation Count: ', numRecon



    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)
    graph = ReconGraph.ReconGraph(DictGraph)
    representatives = [ReconGraph.dictRecToSetRec(graph, dictReps[0])]

    ## Debug info
    ## Modifies the graph 
    ## Checking for the case when there is an error in likelihood 
    print >> sys.stderr, "== Checking for likelihoods over 1 =="
    found = False 
    for key in DictGraph.keys():
        children = DictGraph[key]
        for child in children[:-1]:
            if child[-1] > 1:
                # Attempt to round to fix large float math errors
                roundedValue = round(child[-1])
                if roundedValue != 1.0:
                    print >> sys.stderr, "ERR FOUND: ", key, child 
                    found = True 
                
    if not(found):
        print >> sys.stderr, "NO ERR(s)"
    print >> sys.stderr, "== End of over 1 checks. =="



    print >> sys.stderr, 'Starting K-centers algorithm ... '
    for i in xrange(2, max_k + 2):
        d, newrep = maximize(graph,representatives)
        if not all(d_i > 0 for d_i in d):
            print >> sys.stderr, "Distance vector contains 0", d 
            break

        print i-1, min(d),
        representatives.append(newrep)
        dist_sum = 0
        n = 10
        for _ in xrange(n):
            reps = [KMeans.get_weighted_template(graph) for _ in xrange(i-1)]
            dist_sum += min_d(maximize(graph,reps))
        print float(dist_sum) / n

    print  >> sys.stderr, "Finished k centers algorithm ..."
def main():
    """Convert the COG newick files into numbered-node .tree files.

    For each index 1..6000 the file real-100taxa/COG<index>.newick (index
    zero-padded to four digits) is converted, if it exists, into
    treeFiles/COG<index>.tree. Host nodes are numbered from 1 in the
    iteration order of the host tree dictionary; parasite nodes continue the
    numbering after the last host node.

    The output has five sections: HOSTTREE and PARASITETREE (node number and
    the numbers of its two children, or "null null" when the children are
    [None, None]), HOSTNAMES and PARASITENAMES (node number and name), and
    PHI (host number, parasite number for each entry of phi).
    """
    if not os.path.exists("treeFiles"):
        os.mkdir("treeFiles")

    for i in xrange(6000):
        # Zero-pad to four digits (e.g. 7 -> "0007").
        index = str(i + 1).zfill(4)

        inFile = "real-100taxa/COG" + index + ".newick"
        if not os.path.isfile(inFile):
            continue

        # 'with' closes the output even if parsing or a lookup below raises.
        with open("treeFiles/COG" + index + ".tree", 'w') as outFile:
            host, parasite, phi = newickFormatReader.getInput(inFile)
            H = treeFormat(host)
            P = treeFormat(parasite)

            H_dict = {}   # name:index
            for number, name in enumerate(H, 1):
                H_dict[name] = number

            P_dict = {}   # name:index, continuing after the host numbers
            for number, name in enumerate(P, len(H_dict) + 1):
                P_dict[name] = number

            # Host and parasite trees share one layout; only the section
            # headers, the tree and the numbering differ.
            sections = (
                ("HOSTTREE\n", "\nHOSTNAMES\n", H, H_dict),
                ("\nPARASITETREE\n", "\nPARASITENAMES\n", P, P_dict),
            )
            for treeHeader, namesHeader, tree, ids in sections:
                outFile.write(treeHeader)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t")
                    if tree[key] == [None, None]:
                        outFile.write("null\tnull\n")
                    else:
                        outFile.write(str(ids[tree[key][0]]) + "\t" + str(ids[tree[key][1]]) + "\n")

                outFile.write(namesHeader)
                for key in tree:
                    outFile.write(str(ids[key]) + "\t" + key + "\n")

            outFile.write("\nPHI\n")
            for key in phi:
                outFile.write(str(H_dict[phi[key]]) + "\t" + str(P_dict[key]) + "\n")