def dist_tree(training_distances, training_labels, outpath = "/tmp/training_distances_avg_tree"): import jrs_labels_tree print "","building label vs label distance matrix using averaging..." #dmatrix = jrs_io.load_data(open("/tmp/dist_tree_dmatrix.txt"), cast_method=float) #print "","loaded matrix =",len(dmatrix),"x",len(dmatrix[0]),"..." #import numpy #print numpy.array(dmatrix) print "","training on matrix =",len(training_distances),"x",len(training_distances[0]),"/",len(training_labels)," labels' sets..." dmatrix = jrs_labels_tree.build_sim_matrix_labels(training_distances, training_labels) print "","clearing diagonal..." for i in xrange(len(dmatrix)): dmatrix[i][i] = 0.0 minval = min_matrix(dmatrix) maxval = max_matrix(dmatrix) print numpy.array(dmatrix) print "","minval:",minval print "","maxval:",maxval jrs_io.store_data(open("/tmp/dist_tree_dmatrix.txt","w"), dmatrix) print "","transforming dmatrix by substracting minval" dmatrix_t = [[(e-minval) for e in row ] for row in dmatrix ] dmatrix = dmatrix_t minval = min_matrix(dmatrix) maxval = max_matrix(dmatrix) print numpy.array(dmatrix) print "","minval:",minval print "","maxval:",maxval jrs_io.store_data(open("/tmp/dist_tree_dmatrix_t.txt","w"), dmatrix) #dmatrix = jrs_io.load_data(open("/tmp/dist_tree_dmatrix_t.txt"), cast_method=float) #print "","loaded matrix =",len(dmatrix),"x",len(dmatrix[0]),"..." #import numpy #print numpy.array(dmatrix) treelabels = [str(i+1) for i in range(len(single_labels))] print "","treelabels:", treelabels def gen_tree(outph, hd): phylo_tree = jrs_labels_tree.upgma(treelabels, dmatrix, agreggation_method = 'a', anonclades = hd) print "","writing tree to:",outph jrs_labels_tree.write_tree(phylo_tree, outph) print "","dict_tree=",jrs_labels_tree.phylotree2dicttree(phylo_tree) gen_tree(outpath, False) gen_tree(outpath+"_hd", True)
print "Shuffling labels..." labels_shuffled = [labels[ix] for ix in order] jrs_io.store_labels(open(labels_path+"_shuffled","w"), labels_shuffled) print "Loading distances' file:", distance_matrix_path distances = jrs_io.load_data(open(distance_matrix_path), lambda x: x) try: print "",len(distances), "x",len(distances[0]) except: pass print "Extending order..." order = order + range(n, len(distances)) print "Extended order:", order print "Shuffling columns" distances_tmp = [] for row in distances: new_row = [ row[ix] for ix in order ] distances_tmp.append(new_row) print "Shuffling rows" distances_shuffled = [] for ix in order: distances_shuffled.append(distances_tmp[ix]) fout = open(distance_matrix_path+"_shuffled","w") print "Storing to ",fout jrs_io.store_data(fout, distances_shuffled)
print "Shuffling labels..." labels_shuffled = [labels[ix] for ix in order] jrs_io.store_labels(open(labels_path + "_shuffled", "w"), labels_shuffled) print "Loading distances' file:", distance_matrix_path distances = jrs_io.load_data(open(distance_matrix_path), lambda x: x) try: print "", len(distances), "x", len(distances[0]) except: pass print "Extending order..." order = order + range(n, len(distances)) print "Extended order:", order print "Shuffling columns" distances_tmp = [] for row in distances: new_row = [row[ix] for ix in order] distances_tmp.append(new_row) print "Shuffling rows" distances_shuffled = [] for ix in order: distances_shuffled.append(distances_tmp[ix]) fout = open(distance_matrix_path + "_shuffled", "w") print "Storing to ", fout jrs_io.store_data(fout, distances_shuffled)