def get_response_content(fs):
    # read the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get ordered identifiers
    ordered_tip_name_id_pairs = list(sorted(set(
        (node.get_name(), id(node)) for node in tree.gen_tips())))
    ordered_tip_names, ordered_tip_ids = zip(*ordered_tip_name_id_pairs)
    ordered_internal_ids = [
        id(node) for node in tree.preorder() if not node.is_tip()]
    ordered_ids = list(ordered_tip_ids) + ordered_internal_ids
    # get the distance matrices
    full_D = tree.get_partial_distance_matrix(ordered_ids)
    partial_D = tree.get_partial_distance_matrix(ordered_tip_ids)
    # get the balaji matrices
    full_R = Clustering.get_R_balaji(full_D)
    partial_R = Clustering.get_R_balaji(partial_D)
    # Get the fiedler eigenvector and another eigenvector
    # for the full and the partial balaji matrices.
    full_va, full_vb = get_eigenvectors(full_R)
    partial_va, partial_vb = get_eigenvectors(partial_R)
    # create the response
    out = StringIO()
    print >> out, 'Fiedler vector associated with the graph'
    print >> out, 'for which the internal nodes are hidden:'
    print >> out, str(tuple(partial_va))
    print >> out
    print >> out, 'The tip subvector of the Fiedler vector'
    print >> out, 'associated with the graph of the full tree:'
    print >> out, str(tuple(full_va[:len(ordered_tip_ids)]))
    # write the response
    return out.getvalue()
def run(my_map, reviews, restaurants):
    restaurants = Clustering.filter_restaurants(restaurants)
    normalized_restaurant_ids_to_topics, lda = Clustering.get_predictions(
        my_map, reviews, restaurants)
    create_topic_clusters_and_map(
        restaurants, normalized_restaurant_ids_to_topics, my_map, lda)
def plot_clusters(self, clustering: Clustering,
                  title_fs=14, label_fs=12, ticks_fs=12):
    plt.figure(figsize=self.figsize)
    labels = clustering.get_labels()
    n_clusters = clustering.get_n_clusters()
    title = clustering.get_title()
    self.colors = sns.color_palette(
        palette=clustering.get_palette(), n_colors=n_clusters).as_hex()
    plt.scatter(self.visualized_data.values[:, 0],
                self.visualized_data.values[:, 1],
                s=30,
                c=[self.cluster_color(label) for label in labels],
                alpha=0.5)
    clusters_ax = plt.gca()
    clusters_ax.set_title(title, fontsize=title_fs)
    clusters_ax.set_xlabel('dim1', fontsize=label_fs)
    clusters_ax.set_ylabel('dim2', fontsize=label_fs)
    for tick in (clusters_ax.xaxis.get_major_ticks()
                 + clusters_ax.yaxis.get_major_ticks()):
        tick.label.set_fontsize(ticks_fs)
    plt.savefig(self.plots_dir + title)
def run_pipeline_with_pretrained_doc2vec():
    all_documents = DataLoader.load_all_documents()
    adjacency_matrix_references_all_documents = \
        DataLoader.load_adjacency_matrix_all_documents()
    # Load model
    model = DataLoader.load_model()
    # Run HDBSCAN; returns a list of labels (one per document, -1 == outlier)
    labels = Clustering.run_hdbscan(model=model,
                                    min_cluster_size=4,
                                    min_samples=4)
    # Extract the documents which have been clustered so that we have no
    # outliers. The mask denotes which documents to include; also returns
    # the labels of the clustered documents and the documents themselves.
    mask, labels_subset, clustered_documents = \
        Clustering.extract_clustered_documents(all_documents, labels)
    # Create the adjacency matrix for references between clusters
    cluster_references_adjacency = \
        Clustering.create_adjacency_matrix_for_clusters(
            mask=mask,
            labels=labels_subset,
            adjacency_references_all_documents=
            adjacency_matrix_references_all_documents)
    # k-nearest undirected adjacency
    cluster_references_adjacency = Clustering.make_adjacency_matrix_undirected(
        cluster_references_adjacency, k=3)
    DataLoader.save_data(cluster_references_adjacency, clustered_documents,
                         labels_subset)
    # Create the graph and set up the interactive webpage showing the graph
    visualization.doc_to_vec_visualize(documents=clustered_documents,
                                       adj_matrix=cluster_references_adjacency,
                                       labels=labels_subset)
def gap(X, X_pred, data, met, pjname, numS, ks, bound=(0.1, 0.1)):
    # Generate reference distributions of the original dataset
    nrefs = 10
    shape = data.shape
    refs = None
    if refs == None:
        tops = data.max(axis=0)
        bots = data.min(axis=0)
        dists = scipy.matrix(scipy.diag(tops - bots))
        rands = scipy.random.random_sample(size=(shape[0], shape[1], nrefs))
        for i in range(nrefs):
            rands[:, :, i] = rands[:, :, i] * dists + bots
    else:
        rands = refs
    # Calculate the gap statistic for each k
    res = []
    gaps = scipy.zeros((len(ks),))
    stds = np.zeros((len(ks),))
    gaps_d = scipy.zeros((len(ks),))
    stds_d = np.zeros((len(ks),))
    for (i, k) in enumerate(ks):
        print 'Start analysis on k=', k
        kmc, kml = Clustering.clustering(k, X, data, met)
        # Added procedure: calculate GP mean and var rather than cluster
        # centers; calculate distance by Gaussian likelihood rather than
        # Euclidean distance.
        disp_d = sum([dst(data[m, :], kmc[kml[m], :])
                      for m in range(shape[0])])
        subname = 'K' + '{:02d}'.format(k)
        np.savetxt(pjname + '/' + subname + '_labels.txt',
                   kml, fmt='%s', delimiter='\n')
        kmc, kmstd, kmc_long, kmstd_long, p = calCent(
            X, X_pred, data, pjname, subname, kml, numS, k, bound)
        res.append((kmc, kmstd, kmc_long, kmstd_long, p, kml))
        disp = sum([calDist_long(data[m, :], X, X_pred, kmc_long[kml[m], :],
                                 kmstd_long[kml[m], :], numS)
                    for m in range(shape[0])])
        # Calculate the reference dispersions
        refdisps = scipy.zeros((rands.shape[2],))
        refdisps_d = scipy.zeros((rands.shape[2],))
        for j in range(rands.shape[2]):
            kmc, kml = Clustering.clustering(k, X, rands[:, :, j], met)
            refdisps_d[j] = sum([dst(rands[m, :, j], kmc[kml[m], :])
                                 for m in range(shape[0])])
            subname = 'R' + '{:02d}'.format(k)
            kmc, kmstd, kmc_long, kmstd_long, p = calCent(
                X, X_pred, rands[:, :, j], pjname, subname, kml,
                numS, k, bound)
            refdisps[j] = sum([calDist_long(rands[m, :, j], X, X_pred,
                                            kmc_long[kml[m], :],
                                            kmstd_long[kml[m], :], numS)
                               for m in range(shape[0])])
        gaps[i] = scipy.log(disp) - scipy.mean(scipy.log(refdisps))
        stds[i] = np.std(scipy.log(refdisps)) * np.sqrt(1 + 1 / float(nrefs))
        gaps_d[i] = scipy.mean(scipy.log(refdisps_d)) - scipy.log(disp_d)
        stds_d[i] = np.std(scipy.log(refdisps_d)) * np.sqrt(1 + 1 / float(nrefs))
        #print 'Gap(GP) for', k, 'is', scipy.log(disp), '-', scipy.mean(scipy.log(refdisps)), '=', gaps[i]
        #print 'Gap(ED) for', k, 'is', scipy.log(disp_d), '-', scipy.mean(scipy.log(refdisps_d)), '=', gaps_d[i]
    # Find the optimal k by the std of log(refdisps)
    opt_i = optK2(ks, gaps, stds)
    opt_id = optK2(ks, gaps_d, stds_d)
    # Visualize the gap statistics
    plt.errorbar(ks, gaps, yerr=stds)
    plt.errorbar(ks, gaps_d, yerr=stds_d)
    plt.savefig(pjname + '/GapStatistics.png')
    plt.close()
    pickle.dump((gaps, stds, gaps_d, stds_d),
                open(pjname + '/gaps.dump', 'w'))
    c, std, c_long, std_long, p, labels = res[opt_i]
    return c, std, c_long, std_long, p, ks[opt_i], labels
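# Hedged sketch (assumption -- optK2 is called above but is not part of this
# corpus): a plausible implementation following the Tibshirani et al. (2001)
# rule of picking the smallest k whose gap is at least the next gap minus
# its standard error.
def optK2(ks, gaps, stds):
    for i in range(len(ks) - 1):
        if gaps[i] >= gaps[i + 1] - stds[i + 1]:
            return i
    return len(ks) - 1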
def process_tree(tree, tree_name, show_newick, show_art):
    """
    @param tree: a FelTree to be split by each method
    @param tree_name: a description of the tree
    @param show_newick: an output option
    @param show_art: an output option
    @return: a multi-line output string
    """
    out = StringIO()
    # be verbose if requested
    if show_newick:
        print >> out, 'newick representation of %s:' % tree_name
        print >> out, Newick.get_narrow_newick_string(tree, 80)
    if show_art:
        print >> out, 'ascii art representation of %s:' % tree_name
        print >> out, get_art(tree)
    # cut the tree using each method
    ordered_names = list(sorted(node.get_name() for node in tree.gen_tips()))
    n = len(ordered_names)
    D = tree.get_distance_matrix(ordered_names)
    splitters = (Clustering.StoneExactDMS(),
                 Clustering.StoneSpectralSignDMS())
    splitter_names = ('the +1 / -1 split criterion', 'the fiedler criterion')
    for splitter, splitter_name in zip(splitters, splitter_names):
        small_index_selection = splitter.get_selection(D)
        big_index_selection = set(range(n)) - small_index_selection
        names_a = list(sorted(ordered_names[i]
                              for i in small_index_selection))
        names_b = list(sorted(ordered_names[i]
                              for i in big_index_selection))
        print >> out, 'split inferred by %s:' % splitter_name
        print >> out, '{{%s}, {%s}}' % (', '.join(names_a),
                                        ', '.join(names_b))
    # return the string
    return out.getvalue()
def run_clustering_city(filepath, filename, k, eps, latitude, longitude):
    """
    The function clusters data for a given city and draws the result
    obtained on the map.
    :param filepath: path of the .csv file
    :param filename: name of the .csv file
    :param k: the value of k
    :param eps: the value of eps
    :param latitude: latitude of the city
    :param longitude: longitude of the city
    :return: None
    """
    d = Cluster.ClusterGreatCircles(filepath, filename)
    # NOTE: these loops override the k and eps arguments with fixed values.
    for k in [7]:
        for eps in [50]:
            c = Clustering.K_MXTGreatCircle(eps, k, d)
            c()
            m = Metrics.Modularity(c)
            print(f'k-MXT k={k} eps={eps} Modularity={m()}')
            c.cluster.view_at_map(latitude=latitude, longitude=longitude,
                                  filename_of_map=f'{k}-MXT-eps{eps}')
            c = Clustering.K_MXTGaussGreatCircle(eps, k, d)
            c()
            c.cluster.view_at_map(latitude=latitude, longitude=longitude,
                                  filename_of_map=f'{k}-MXTGauss-eps{eps}')
            m = Metrics.Modularity(c)
            print(f'k-MXT-Gauss k = {k} eps = {eps} Modularity = {m()}')
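# Illustrative invocation of run_clustering_city; the path, file name, and
# the Saint Petersburg coordinates below are made-up example values, and the
# CSV must actually exist for the call to succeed. Note that k and eps are
# currently overridden by the hardcoded loops inside the function.
run_clustering_city(filepath='./data/', filename='spb.csv', k=7, eps=50,
                    latitude=59.9343, longitude=30.3351)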
def getClusteringEvalPlots(dataset):
    noOfClusters = range(2, 11, 1)
    for ds in dataset:
        sse = [[]]
        sil = [[[], []]]
        scores = [[[], []], [[], []], [[], []], [[], []], [[], []]]
        for cluster in noOfClusters:
            kmLearner = Clustering.KM(n_clusters=cluster)
            kmLearner.getLearner().fit(ds.training_x)
            emLearner = Clustering.EM(n_components=cluster)
            emLearner.getLearner().fit(ds.training_x)
            clusteringY_KM = kmLearner.getLearner().predict(ds.training_x)
            clusteringY_EM = emLearner.getLearner().predict(ds.training_x)
            homogeneityKM, completenessKM, v_measureKM = \
                homogeneity_completeness_v_measure(ds.training_y,
                                                   clusteringY_KM)
            AMISKM = adjusted_mutual_info_score(ds.training_y, clusteringY_KM)
            ARSKM = adjusted_rand_score(ds.training_y, clusteringY_KM)
            silhouetteKM = silhouette_score(ds.training_x, clusteringY_KM)
            homogeneityEM, completenessEM, v_measureEM = \
                homogeneity_completeness_v_measure(ds.training_y,
                                                   clusteringY_EM)
            AMISEM = adjusted_mutual_info_score(ds.training_y, clusteringY_EM)
            ARSEM = adjusted_rand_score(ds.training_y, clusteringY_EM)
            silhouetteEM = silhouette_score(ds.training_x, clusteringY_EM)
            sse.append(kmLearner.getLearner().inertia_)
            sil[0][0].append(silhouetteKM)
            scores[0][0].append(v_measureKM)
            scores[1][0].append(AMISKM)
            scores[2][0].append(ARSKM)
            scores[3][0].append(homogeneityKM)
            sil[0][1].append(silhouetteEM)
            scores[0][1].append(v_measureEM)
            scores[1][1].append(AMISEM)
            scores[2][1].append(ARSEM)
            scores[3][1].append(homogeneityEM)
        # note: newer matplotlib releases rename this style
        # 'seaborn-v0_8-whitegrid'
        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, sil[0][0], label='Silhouette Score, KM',
                 marker='o')
        plt.plot(noOfClusters, sil[0][1], label='Silhouette Score, EM',
                 marker='o', linestyle='--')
        plt.ylabel('Silhouette Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Silhouette Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Silhouette for ' + ds.name + '.png')
        plt.close()
        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, scores[0][0], label='V Measure, KM',
                 marker='o')
        plt.plot(noOfClusters, scores[1][0], label='Adj. Mutual Info, KM',
                 marker='o')
        plt.plot(noOfClusters, scores[2][0], label='Adj. Rand. Score, KM',
                 marker='o')
        plt.plot(noOfClusters, scores[0][1], label='V Measure, EM',
                 marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[1][1], label='Adj. Mutual Info, EM',
                 marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[2][1], label='Adj. Rand. Score, EM',
                 marker='o', linestyle='--')
        plt.ylabel('Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Score Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Score for ' + ds.name + '.png')
        plt.close()
def main(): prefix_500 = "500" prefix_1000 = "1000" prefix_2000 = "2000" suffix_start_count = 1 suffix_end_count = 10 design_kmer_list = kmers.get_design_kmers() use_cluster_size_hard_stop = False for suffix_count in range(suffix_start_count, suffix_end_count + 1): input_file_name = prefix_500 + '_' + str(suffix_count) sequence_kmer_list = kmers.get_sequence_kmers(input_file_name) cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list, design_kmer_list, use_cluster_size_hard_stop) pickle.dump( cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb")) consensus_kmer_list = cluster_dict.keys() consensus_mapping_list = get_bipartite_matching( consensus_kmer_list, design_kmer_list) output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict) for suffix_count in range(suffix_start_count, suffix_end_count + 1): input_file_name = prefix_1000 + '_' + str(suffix_count) sequence_kmer_list = kmers.get_sequence_kmers(input_file_name) cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list, design_kmer_list, use_cluster_size_hard_stop) pickle.dump( cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb")) consensus_kmer_list = cluster_dict.keys() consensus_mapping_list = get_bipartite_matching( consensus_kmer_list, design_kmer_list) output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict) for suffix_count in range(suffix_start_count, suffix_end_count + 1): input_file_name = prefix_2000 + '_' + str(suffix_count) sequence_kmer_list = kmers.get_sequence_kmers(input_file_name) cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list, design_kmer_list, use_cluster_size_hard_stop) pickle.dump( cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb")) consensus_kmer_list = cluster_dict.keys() consensus_mapping_list = get_bipartite_matching( consensus_kmer_list, design_kmer_list) output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict)
def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    """
    # read the criterion string, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    # read the original tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # define the maximum number of steps we want
    max_steps = 1000000
    # Make sure that the splitter object is appropriate
    # for the number of taxa and the number of tree reconstructions.
    ntaxa = len(list(tree.gen_tips()))
    if splitter.get_complexity(ntaxa) * fs.iterations > max_steps:
        msg_a = 'use a faster bipartition function, '
        msg_b = 'fewer taxa, or fewer tree reconstructions'
        raise HandlingError(msg_a + msg_b)
    # define the simulation parameters
    sim = Simulation(splitter, 'nj', 'cgi tree building simulation')
    sim.set_original_tree(tree)
    sim.set_step_limit(max_steps)
    # define an arbitrary but consistent ordering of the taxa
    ordered_names = [node.name for node in tree.gen_tips()]
    # attempt to simulate a bunch of distance matrices
    sampler = DMSampler.DMSampler(tree, ordered_names, fs.length)
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if a proposal was accepted then add it to the list
        if result:
            sequence_list, distance_matrix = result
            distance_matrices.append(distance_matrix)
        # if enough accepted samples have been generated then stop sampling
        remaining_acceptances = fs.iterations - len(distance_matrices)
        if not remaining_acceptances:
            break
        # If the remaining number of computrons is predicted
        # to be too much then stop.
        if sampler.get_remaining_computrons(remaining_acceptances) > max_steps:
            msg_a = 'this combination of parameters '
            msg_b = 'is predicted to take too long'
            raise HandlingError(msg_a + msg_b)
    sim.run(distance_matrices, ordered_names)
    # define the response
    out = StringIO()
    print >> out, 'partition error count frequencies:'
    print >> out, sim.get_histogram_string()
    print >> out, ''
    print >> out, 'weighted partition errors:', sim.get_deep_loss()
    # return the response
    return out.getvalue()
def create_topic_clusters_and_map(restaurants, restaurant_ids_to_topics,
                                  my_map, lda, use_human_labels=True):
    data = Clustering.create_data_array(restaurants,
                                        restaurant_ids_to_topics, my_map)
    Clustering.plot_clusters(my_map, restaurants, restaurant_ids_to_topics,
                             data, lda)
def get_response_content(fs):
    # read the matrix
    D = fs.matrix
    if len(D) < 3:
        raise HandlingError('the matrix should have at least three rows')
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if len(ordered_labels) != len(D):
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of rows in the matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_labels)) != len(ordered_labels):
        raise HandlingError('the ordered labels must be unique')
    # read the criterion string, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    # Make sure that the splitter object
    # is appropriate for the size of the distance matrix.
    if splitter.get_complexity(len(D)) > 1000000:
        msg = 'use a smaller distance matrix or a faster bipartition function'
        raise HandlingError(msg)
    # read the original tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    if len(ordered_labels) != len(list(tree.gen_tips())):
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of tips in the tree'
        raise HandlingError(msg_a + msg_b)
    tree_tip_names = set(tip.name for tip in tree.gen_tips())
    if tree_tip_names != set(ordered_labels):
        msg_a = 'the leaf labels of the tree do not match '
        msg_b = 'the ordered labels of the distance matrix rows'
        raise HandlingError(msg_a + msg_b)
    # create the tree builder
    tree_builder = NeighborhoodJoining.ValidatingTreeBuilder(
        D.tolist(), ordered_labels, splitter)
    # Read the recourse string and set the corresponding method
    # in the tree builder.
    if fs.njrecourse:
        tree_builder.set_fallback_name('nj')
    elif fs.halvingrecourse:
        tree_builder.set_fallback_name('halving')
    # define the response
    out = StringIO()
    # set parameters of the tree validating tree builder
    tree_builder.set_original_tree(tree)
    tree_builder.set_output_stream(out)
    tree = tree_builder.build()
    # return the response
    return out.getvalue()
def do_hard_coded_analysis_a(tree, tree_remark):
    """
    Do a hardcoded analysis of tree reconstruction methods.
    Make a bunch of R files.
    @param tree: a tree object
    @param tree_remark: a string that is a comment about the tree
    """
    # define an arbitrary order for the names of the leaves of the tree
    ordered_names = list(node.name for node in tree.gen_tips())
    # use 1000 replicates
    reconstruction_count = 1000
    # Make R files for reconstruction results
    # from sequences 100 and 500 nucleotides long.
    for sequence_length in (100, 500):
        # sample distance matrices
        print 'sampling', reconstruction_count, 'distance matrices'
        print 'from alignments of length', sequence_length
        sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length)
        distance_matrices = []
        for result in sampler.gen_samples_or_none():
            # if the proposal was rejected then try again
            if not result:
                continue
            # add the accepted distance matrix sample to the list
            sequence_list, distance_matrix = result
            distance_matrices.append(distance_matrix)
            # stop when we have generated enough distance matrices
            if len(distance_matrices) == reconstruction_count:
                break
        # run both neighbor joining and spectral sign clustering
        sims = [
            Simulation(Clustering.NeighborJoiningDMS(), 'nj',
                       'neighbor joining'),
            Simulation(Clustering.StoneSpectralSignDMS(), 'nj',
                       'spectral sign')]
        for sim in sims:
            print 'reconstructing', len(distance_matrices), 'trees'
            print 'using', sim.description
            sim.set_original_tree(tree)
            sim.run(distance_matrices, ordered_names)
        # consider the neighbor joining and the spectral sign results
        nj_sim, ss_sim = sims
        # write the uniform loss function comparison R script
        script_contents = R_helper(nj_sim.get_normalized_error_counts(),
                                   ss_sim.get_normalized_error_counts())
        filename = 'uniform_%d.R' % sequence_length
        with open(filename, 'w') as fout:
            print >> fout, script_contents
        # write the weighted loss function comparison R script
        script_contents = R_helper(nj_sim.get_normalized_loss_values(),
                                   ss_sim.get_normalized_loss_values())
        filename = 'weighted_%d.R' % sequence_length
        with open(filename, 'w') as fout:
            print >> fout, script_contents
def plotClusterWordCloudArray(articles, articleCentroidIds, Ks):
    fig, axes = plt.subplots(Ks.shape[0], Ks.shape[1], figsize=(12, 12))
    for i in range(Ks.shape[0]):
        for j in range(Ks.shape[1]):
            axes[i, j].imshow(createWordCloud(
                Clustering.concatinateClusterTexts(
                    articles, articleCentroidIds, Ks[i, j])))
            axes[i, j].axis("off")
            axes[i, j].set_title(
                "Cluster " + str(Ks[i, j]) + "; count=" + str(
                    Clustering.countClusterArticles(
                        articles, articleCentroidIds, Ks[i, j])))
def create_clustering(documents, config):
    """Returns a clustering of the documents."""
    cluster_alg = config.get('clustering', 'cluster_algorithm')
    # start dbscan and infer the options
    if cluster_alg == 'dbscan' and not config.has_option(
            'clustering', 'cluster_options'):
        clusters = Clustering.create_clusters_dbscan_infer_options(documents)
    # use given options
    else:
        cluster_options = json.loads(
            config.get('clustering', 'cluster_options'))
        clusters = Clustering.create_clusters(documents, cluster_alg,
                                              cluster_options)
    return clusters
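# Illustrative only: a config object (Python 3 configparser spelling) that
# would exercise the explicit-options branch of create_clustering above.
# The section and option names come from the code; the values are made up.
import configparser

config = configparser.ConfigParser()
config['clustering'] = {
    'cluster_algorithm': 'dbscan',
    'cluster_options': '{"eps": 0.5, "min_samples": 5}',
}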
def main():
    xlxs_to_maps = XlxsToMapsWrapper()
    xlxs_to_maps.getLatLngData()
    # Compute the cost of all schools from all centers for clustering
    get_and_store_cost = GetAndStoreCost()
    get_and_store_cost.compute_cost_based_on_saved_text_file()
    # get_and_store_cost.compute_costs()
    # get_and_store_cost.compute_costs_thread_pool()
    # Identify the high level clusters
    clustering = Clustering()
    clustering.high_level_clustering()
    return
def gold_ratio(browser):
    cee_result = Cee.color_element_from_url(browser)
    cl_list = []
    color_size_dict = {}
    for ce in cee_result:
        text_size_dict = ce[1].size
        cl_list.append(color_to_tuple(ce[2]))
        text_area = text_size_dict['height'] * text_size_dict['width']
        color_size_dict[color_to_tuple(ce[2])] = text_area
        back_size_dict = ce[3].size
        cl_list.append(color_to_tuple(ce[4]))
        back_area = back_size_dict['height'] * back_size_dict['width']
        color_size_dict[color_to_tuple(ce[4])] = back_area
    clust_result = Clust.Clustering(cl_list, 3)
    color_ratio = []
    for cr in clust_result:
        size_sum = 0
        for t in cr:
            size_sum += color_size_dict[t]
        color_ratio.append(size_sum)
    color_ratio.sort(reverse=True)
    # print(color_ratio)
    # Score the deviation from a 6:3:1 area ratio among the three dominant
    # color clusters (the "60-30-10" design rule).
    b_ratio = color_ratio[0] / color_ratio[2]
    s_ratio = color_ratio[1] / color_ratio[2]
    golden = (b_ratio - 6) / 9 + (s_ratio - 3) / 9
    return 1 - abs(golden)
def get_response_content(fs):
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get the selected names
    selection = Util.get_stripped_lines(fs.selection.splitlines())
    selected_name_set = set(selection)
    possible_name_set = set(node.get_name() for node in tree.gen_tips())
    extra_names = selected_name_set - possible_name_set
    if extra_names:
        msg_a = 'the following selected names '
        msg_b = 'are not valid tips: %s' % str(tuple(extra_names))
        raise HandlingError(msg_a + msg_b)
    complement_name_set = possible_name_set - selected_name_set
    # assert that neither the selected name set nor its complement is empty
    if not selected_name_set or not complement_name_set:
        raise HandlingError('the selection is degenerate')
    # define an ordering on the tips
    ordered_names = [node.get_name() for node in tree.gen_tips()]
    # convert the selected names to a Y vector
    Y_as_list = []
    for name in ordered_names:
        if name in selected_name_set:
            value = 1
        else:
            value = -1
        Y_as_list.append(value)
    Y = np.array(Y_as_list)
    # get the distance matrix
    D = tree.get_distance_matrix(ordered_names)
    # get the R matrix
    R = Clustering.get_R_balaji(D)
    value = np.dot(np.dot(Y, R), Y.T)
    # return the taxon split evaluation
    return str(value) + '\n'
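# Illustrative only: the split score computed above is the quadratic form
# Y R Y^T for a +/-1 membership vector Y and a symmetric matrix R; entries
# between same-side leaves add and cross-side entries subtract. The toy R
# below is made up purely to show the arithmetic.
import numpy as np

Y = np.array([1, 1, -1, -1])
R = np.array([[0.0, 2.0, 0.5, 0.5],
              [2.0, 0.0, 0.5, 0.5],
              [0.5, 0.5, 0.0, 2.0],
              [0.5, 0.5, 2.0, 0.0]])
# within-side weight 8 minus cross-side weight 4
print(np.dot(np.dot(Y, R), Y.T))  # 4.0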
def data_preprosessing(self, method='MeanShift'):
    new_column = []
    if method == 'Uniform Distribution':
        for column in range(len(self.header_sublist)):
            self.new_data_frame[self.header_sublist[column]] = \
                self.data_preprosessing_uniform_distribution(
                    self.header_sublist[column], 8)
    elif method == 'Equal Steps':
        self.data_preprosessing_equal_range()
    else:  # 'MeanShift'
        for column in self.header_sublist:
            labled_column, cluster_centers = Clustering.meanShift(
                self.data_frame[column].values)
            # we need cluster_centers for future prediction and
            # clustering of a new song
            self.new_data_frame[column] = labled_column
    for i in range(len(self.data_frame['song_popularity'])):
        if self.data_frame['song_popularity'][i] < 20:
            new_column.append(0)
        elif self.data_frame['song_popularity'][i] < 40:
            new_column.append(1)
        elif self.data_frame['song_popularity'][i] < 60:
            new_column.append(2)
        elif self.data_frame['song_popularity'][i] < 80:
            new_column.append(3)
        else:
            new_column.append(4)
    self.new_data_frame['song_popularity'] = new_column
    self.new_data_frame.to_csv(self.output_file_name)
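# Hedged sketch of the "future prediction" step the comment above alludes
# to: bin a new song's feature value by its nearest MeanShift cluster
# center. cluster_centers is assumed to be the 1-D array returned by
# Clustering.meanShift; the helper name is hypothetical.
import numpy as np

def label_new_value(value, cluster_centers):
    centers = np.asarray(cluster_centers, dtype=float).ravel()
    return int(np.argmin(np.abs(centers - value)))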
def do_first_method(subtree_a, subtree_b, taxa_a1, taxa_a2, taxa_b1, taxa_b2,
                    connecting_branch_length, out):
    # define the branch lengths of the reduced tree
    blen_a1, blen_a2, blen_ar = get_branch_length_equivalents(
        subtree_a, taxa_a1, taxa_a2)
    blen_b1, blen_b2, blen_br = get_branch_length_equivalents(
        subtree_b, taxa_b1, taxa_b2)
    # define the distance matrix of the reduced tree
    a, b = blen_a1, blen_a2
    c, d = blen_b1, blen_b2
    e = connecting_branch_length + blen_ar + blen_br
    reduced_D = [
        [0, a+b, a+e+c, a+e+d],
        [b+a, 0, b+e+c, b+e+d],
        [c+e+a, c+e+b, 0, c+d],
        [d+e+a, d+e+b, d+c, 0]]
    # get the R matrix of the reduced tree
    reduced_R = Clustering.get_R_balaji(reduced_D)
    print >> out, 'first method:'
    print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a, b)
    print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c, d)
    print >> out, 'equivalent connecting branch length:', e
    print >> out, 'M for the equivalent tree:'
    print >> out, MatrixUtil.m_to_string(reduced_R)
    print >> out
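# Illustrative sanity check of the quartet distances built above: with tip
# branch lengths a, b, c, d and internal branch e, the four point condition
# holds -- the two largest of the three pairwise sums agree. The values
# here are arbitrary.
a, b, c, d, e = 1.0, 2.0, 0.5, 1.5, 0.25
D = [[0, a+b, a+e+c, a+e+d],
     [b+a, 0, b+e+c, b+e+d],
     [c+e+a, c+e+b, 0, c+d],
     [d+e+a, d+e+b, d+c, 0]]
s_ab_cd = D[0][1] + D[2][3]  # a+b+c+d
s_ac_bd = D[0][2] + D[1][3]  # a+b+c+d + 2e
s_ad_bc = D[0][3] + D[1][2]  # a+b+c+d + 2e
assert s_ac_bd == s_ad_bc and s_ab_cd <= s_ac_bd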
def trainData(data):
    path = os.getcwd()
    class1 = Clustering.cluster(path)
    data_seg = class1.preTreating(inptdata=data)
    for i in range(0, len(data_seg)):
        data_seg[i] = data_seg[i].split(" ")
    return data_seg
def main():
    parser = OptionParser()
    parser.add_option("-t", "--test", dest="test",
                      default=False, action="store_true",
                      help="Test the given set with the centroids of clusters")
    parser.add_option("-s", "--save", dest="saveCentroids",
                      default=False, action="store_true",
                      help="Save the centroids of each cluster in a file")
    parser.add_option("-f", "--force", dest="forceMatrix",
                      default=False, action="store_true",
                      help="Creates similarity matrix even if it exists")
    (options, args) = parser.parse_args()
    start_time = time.time()
    files = os.listdir(SPN_PATH)
    # If the similarity matrix does not exist yet, generate it
    if os.path.isfile(MATRIX_FILE):
        H = np.load(MATRIX_FILE)
    else:
        H = Create_Similarity_Matrix(files)
    print 'Working with matrix.... ' + MATRIX_FILE
    print 'Matrix creation time: ', time.time() - start_time
    Clusters = clr.HierarchicalClustering(H, SIMILARITY_MSR)
    if options.test:
        Centroids = GetClusterCentroids(Clusters, files,
                                        options.saveCentroids)
        SimpleClustering(Clusters, Centroids, files)
    Eval = EvaluateClusters(Clusters, files)
    SaveOutput(Clusters, Eval, files)
    print "Total number of clusters/cameras = " + str(len(Clusters))
    print time.time() - start_time, "seconds"
def get_response_content(fs):
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that the given labels are tips of the tree
    tip_name_set = set(node.get_name() for node in tree.gen_tips())
    user_name_set = set([fs.lhs_a, fs.lhs_b, fs.rhs_a, fs.rhs_b])
    bad_names = user_name_set - tip_name_set
    if bad_names:
        msg = 'these labels are not valid tips: %s' % ', '.join(bad_names)
        raise HandlingError(msg)
    # get the submatrix of the distance matrix
    ordered_names = list(sorted(node.get_name()
                                for node in tree.gen_tips()))
    D = np.array(tree.get_distance_matrix(ordered_names))
    # get the response matrix
    R = Clustering.get_R_stone(D)
    # get the two by two matrix
    name_to_index = dict((name, i) for i, name in enumerate(ordered_names))
    R_reduced = np.zeros((2, 2))
    la = name_to_index[fs.lhs_a]
    lb = name_to_index[fs.lhs_b]
    ra = name_to_index[fs.rhs_a]
    rb = name_to_index[fs.rhs_b]
    R_reduced[0][0] = R[la][ra]
    R_reduced[0][1] = R[la][rb]
    R_reduced[1][0] = R[lb][ra]
    R_reduced[1][1] = R[lb][rb]
    epsilon = 1e-13
    criterion = np.linalg.det(R_reduced)
    if abs(criterion) < epsilon:
        criterion = 0
    # In analogy to the four point condition,
    # use two different ways of calculating the distance.
    blen_a = (D[la][rb] + D[lb][ra] - D[la][lb] - D[ra][rb]) / 2.0
    blen_b = (D[la][ra] + D[lb][rb] - D[la][lb] - D[ra][rb]) / 2.0
    blen = min(blen_a, blen_b)
    # define the response
    out = StringIO()
    paragraphs = []
    if fs.show_response:
        paragraph = [
            'response matrix with rows ordered alphabetically by leaf label:',
            MatrixUtil.m_to_string(R)]
        paragraphs.append(paragraph)
    if fs.show_reduced_response:
        paragraph = [
            '2x2 submatrix of the response matrix:',
            MatrixUtil.m_to_string(R_reduced)]
        paragraphs.append(paragraph)
    if True:
        paragraph = [
            'determinant of the 2x2 submatrix of the response matrix:',
            str(criterion)]
        paragraphs.append(paragraph)
    if fs.show_blen:
        paragraph = ['branch length defined by the split:', str(blen)]
        paragraphs.append(paragraph)
    # return the response
    return '\n\n'.join('\n'.join(p) for p in paragraphs) + '\n'
def textInput(text):
    # test_test = "CRM宽带资源的地址在哪里"
    #   (example query: "Where is the address of the CRM broadband resource?")
    path = os.getcwd()
    class1 = Clustering.cluster(path)
    sentence = class1.preTreating(inptdata=text)
    sentence = sentence[0].split(" ")
    return sentence
def performDBSCAN(self):
    if len(self.smallGoldParticleCoordinates) < 1:
        print 'DBSCAN cannot be performed, not enough data points'
    else:
        epsilonInPixel = (self.DBSCANEpsilonDoubleSpinBox.value()
                          * nmPixelRatio)
        # perform DBSCAN
        array, labels = Clustering.dbscanClustering(
            self.smallGoldParticleCoordinates, [], [], epsilonInPixel,
            self.DBSCANMinSampleDoubleSpinBox.value())
        # appending X and Y coordinates of the localization points to
        # vectors to save them with the cluster IDs (labels)
        xVec = []
        yVec = []
        for i in range(0, len(array)):
            xVec.append(array[i][0])
            yVec.append(array[i][1])
        # appending vectors to a matrix
        dataMatrix = []
        dataMatrix.append(xVec)
        dataMatrix.append(yVec)
        dataMatrix.append(labels)
        # transpose the matrix to have the data columnwise and save it
        # with a '_DBSCAN' suffix
        dataMatrix = np.transpose(np.asarray(dataMatrix))
        filenameToSave = (str(openedFilename[0:-4])
                          + '_DBSCAN_epsilon='
                          + str(int(self.DBSCANEpsilonDoubleSpinBox.value()))
                          + 'nm_minSample='
                          + str(int(self.DBSCANMinSampleDoubleSpinBox.value()))
                          + '.txt')
        np.savetxt(filenameToSave, dataMatrix, fmt='%0.0f')
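# Hedged sketch (assumption) of what Clustering.dbscanClustering might wrap:
# scikit-learn's DBSCAN, returning the points as an array plus one label per
# point (-1 for noise). The two empty-list arguments passed above are kept
# but ignored here because their role is not visible in this corpus.
import numpy as np
from sklearn.cluster import DBSCAN

def dbscan_clustering(points, unused_a, unused_b, eps, min_samples):
    arr = np.asarray(points, dtype=float)
    labels = DBSCAN(eps=eps, min_samples=int(min_samples)).fit_predict(arr)
    return arr, labels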
def target_plot(self, label: str, anomalies=False):
    labels = self.target[label].values
    n_labels = max(pd.unique(labels).max(), len(pd.unique(labels))) + 1
    return Clustering(labels=labels,
                      n_clusters=n_labels,
                      title=label.capitalize() + ' True Labels',
                      palette=self.palette)
def get_response_content(fs):
    # read the matrix
    D = fs.matrix
    if len(D) < 3:
        raise HandlingError('the matrix should have at least three rows')
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(fs.labels.splitlines())
    if len(ordered_labels) != len(D):
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of rows in the matrix'
        raise HandlingError(msg_a + msg_b)
    # create the tree building object
    splitter = Clustering.StoneExactDMS()
    tree_builder = NeighborhoodJoining.TreeBuilder(
        D.tolist(), ordered_labels, splitter)
    # Read the recourse string and set the corresponding method
    # in the tree builder.
    recourse_string = fs.getfirst('recourse')
    if fs.njrecourse:
        tree_builder.set_fallback_name('nj')
    elif fs.halvingrecourse:
        tree_builder.set_fallback_name('halving')
    # assert that the computation will not take too long
    if tree_builder.get_complexity() > 1000000:
        raise HandlingError('this computation would take too long')
    # build the tree
    tree = tree_builder.build()
    # return the response
    return NewickIO.get_newick_string(tree) + '\n'
def __init__(self, data):
    path = os.getcwd()
    class1 = Clustering.cluster(path)
    result = class1.kmeans(data=data)
    self.clus_result = result[0]
    self.weight = result[1]
    self.tfidf_model = result[2:5]
    self.class1 = class1
class Node:

    def __init__(self, LayerNumber, NodePos,
                 cifarstat={'patch_mean': [],
                            'patch_std': [],
                            'whiten_mat': []}):
        self.LayerNumber = LayerNumber
        self.NodePosition = NodePos
        self.Belief = []
        # cifarStat = load_cifar(4)
        # to be used for Normalization and Whitening Purposes
        self.patch_mean = cifarstat['patch_mean']
        self.patch_std = cifarstat['patch_std']
        self.v = cifarstat['whiten_mat']

    def initNodeLearningParams(self, AlgorithmChoice, AlgParams):
        self.AlgorithmChoice = AlgorithmChoice
        if AlgorithmChoice == 'Clustering':
            CentsPerLayer = AlgParams['NumCentsPerLayer']
            # InputWidth = InputWidths[LayerNum]
            if self.LayerNumber == 0:
                InputWidth = 48
            else:
                InputWidth = CentsPerLayer[self.LayerNumber - 1] * 4
            self.LearningAlgorithm = Clustering(
                AlgParams['mr'], AlgParams['vr'], AlgParams['sr'],
                InputWidth,
                AlgParams['NumCentsPerLayer'][self.LayerNumber],
                self.NodePosition)
        else:
            print('Only Incremental Clustering Exists')

    def loadInput(self, In):
        if self.LayerNumber == 0:
            In = In - self.patch_mean
            In = In / self.patch_std
            In = In.dot(self.v)
        self.Input = In

    def doNodeLearning(self, Mode):
        if self.AlgorithmChoice == 'Clustering':
            self.LearningAlgorithm.update_node(self.Input, Mode)
            self.Belief = self.LearningAlgorithm.belief
        else:
            print("Only Incremental Clustering Algorithm Exists")
def main(): prefix_500 = "500" prefix_1000 = "1000" prefix_2000 = "2000" suffix_start_count = 1 suffix_end_count = 10 design_kmer_list = kmers.get_design_kmers() use_cluster_size_hard_stop = False for suffix_count in range(suffix_start_count, suffix_end_count + 1): input_file_name = prefix_500 + '_' + str(suffix_count) sequence_kmer_list = kmers.get_sequence_kmers(input_file_name) cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list, design_kmer_list, use_cluster_size_hard_stop) pickle.dump(cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb")) consensus_kmer_list = cluster_dict.keys() consensus_mapping_list = get_bipartite_matching(consensus_kmer_list, design_kmer_list) output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict) for suffix_count in range(suffix_start_count, suffix_end_count + 1): input_file_name = prefix_1000 + '_' + str(suffix_count) sequence_kmer_list = kmers.get_sequence_kmers(input_file_name) cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list, design_kmer_list, use_cluster_size_hard_stop) pickle.dump(cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb")) consensus_kmer_list = cluster_dict.keys() consensus_mapping_list = get_bipartite_matching(consensus_kmer_list, design_kmer_list) output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict) for suffix_count in range(suffix_start_count, suffix_end_count + 1): input_file_name = prefix_2000 + '_' + str(suffix_count) sequence_kmer_list = kmers.get_sequence_kmers(input_file_name) cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list, design_kmer_list, use_cluster_size_hard_stop) pickle.dump(cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb")) consensus_kmer_list = cluster_dict.keys() consensus_mapping_list = get_bipartite_matching(consensus_kmer_list, design_kmer_list) output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict)
def transform_c(L, D_partial):
    """
    The matrices are ordered with leaves before internal nodes.
    @param L: full tree laplacian matrix
    @param D_partial: distances between leaves
    @return: the S matrix
    """
    M = Clustering.get_R_stone(D_partial)
    S = -2 * M
    return S
def getMaxSimilarityForNJ(d, seq):
    Q = {}
    n = len(seq)
    for (i, j) in d:
        sumI = 0
        sumJ = 0
        for k in seq:
            if i != k:
                sumI += d[i, k]
            if j != k:
                sumJ += d[j, k]
        Q[i, j] = (n - 2) * d[i, j] - sumI - sumJ
    maxSimilarity = Clustering.getMax(Q)
    return maxSimilarity
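# Hedged sketch (assumption -- the helper itself is not part of this
# corpus): Clustering.getMax, used by both the NJ helper above and the
# UPGMA helper below, plausibly returns the dictionary key holding the
# largest value.
def getMax(similarity_dict):
    return max(similarity_dict, key=similarity_dict.get)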
def get_response_content(fs):
    # start writing the response type
    response_headers = []
    # get the processing options
    use_internal_nodes = fs.internal
    # read the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get the ordered ids and ordered names of the nodes in the tree
    ordered_name_id_pairs = []
    for node in tree.preorder():
        # define the name of the node
        name = ''
        if node.is_tip():
            name = node.get_name()
        # possibly add the node
        if use_internal_nodes:
            ordered_name_id_pairs.append((name, id(node)))
        elif node.is_tip():
            ordered_name_id_pairs.append((name, id(node)))
    ordered_ids = [id_ for name, id_ in ordered_name_id_pairs]
    ordered_names = [name for name, id_ in ordered_name_id_pairs]
    #raise HandlingError('debug: ' + str(ordered_names))
    id_to_index = dict((id_, i) for i, id_ in enumerate(ordered_ids))
    # get the incidence matrix for drawing lines
    n = len(ordered_ids)
    incidence_matrix = [[0]*n for i in range(n)]
    if use_internal_nodes:
        for node in tree.preorder():
            for child in node.gen_children():
                parent_id = id_to_index[id(node)]
                child_id = id_to_index[id(child)]
                incidence_matrix[parent_id][child_id] = 1
                incidence_matrix[child_id][parent_id] = 1
    # get the R matrix from the tree; this is -1/2 times the laplacian matrix
    if use_internal_nodes:
        D = tree.get_full_distance_matrix(ordered_ids)
    else:
        D = tree.get_distance_matrix(ordered_names)
    R_matrix = Clustering.get_R_balaji(D)
    # draw the image
    try:
        ext = Form.g_imageformat_to_ext[fs.imageformat]
        image_size = (640, 480)
        return get_image(R_matrix, incidence_matrix, ordered_names,
                         image_size, ext, fs.axes, fs.connections)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_response_content(fs):
    # read the weighted adjacency matrix
    A = fs.matrix
    # read the labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    # Assert that the number of labels
    # is compatible with the shape of the matrix.
    n = len(A)
    if len(ordered_labels) != n:
        msg = 'the number of labels does not match the size of the matrix'
        raise HandlingError(msg)
    # get the best objective function value and the corresponding best cluster
    if fs.conductance:
        max_size = 20
        if n > max_size:
            msg_a = 'for the min conductance objective function please '
            msg_b = 'limit the size of the matrix to %d rows' % max_size
            raise HandlingError(msg_a + msg_b)
        pairs = [(get_conductance(assignment, A), assignment)
                 for assignment in Clustering.gen_assignments(n)]
        best_objective, best_assignment = min(pairs)
        best_cluster = set(i for i in range(n) if best_assignment[i] == 1)
    if fs.min:
        best_cluster = StoerWagner.stoer_wagner_min_cut(A)
        complement = set(range(n)) - best_cluster
        best_objective = sum(A[i][j]
                             for i in best_cluster for j in complement)
    # get the smaller of the two clusters
    complement = set(range(n)) - best_cluster
    small_cluster = min((len(best_cluster), best_cluster),
                        (len(complement), complement))[1]
    # start to prepare the response
    out = StringIO()
    print >> out, 'smallest cluster defined by the bipartition:'
    for index in sorted(small_cluster):
        print >> out, ordered_labels[index]
    print >> out, ''
    print >> out, 'objective function value:'
    print >> out, best_objective
    # write the response
    return out.getvalue()
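# Hedged sketch (assumption -- the helper is not shown in this corpus) of
# get_conductance for a +/-1 assignment over the weighted adjacency matrix
# A: cut weight divided by the smaller side's volume. Degenerate one-sided
# assignments are presumed to be excluded by Clustering.gen_assignments.
def get_conductance(assignment, A):
    n = len(A)
    cut = sum(A[i][j] for i in range(n) for j in range(n)
              if assignment[i] != assignment[j]) / 2.0
    vol_pos = sum(A[i][j] for i in range(n) for j in range(n)
                  if assignment[i] == 1)
    vol_neg = sum(A[i][j] for i in range(n) for j in range(n)
                  if assignment[i] != 1)
    return cut / min(vol_pos, vol_neg)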
def get_branch_length_equivalents(tree, first_taxa, second_taxa):
    """
    @param tree: a newick tree
    @param first_taxa: a set of tip names
    @param second_taxa: another set of tip names
    @return: a triple (first distance, second distance, root distance)
    """
    # get the root-augmented distance matrix
    D_aug = get_root_augmented_distance_matrix(tree, first_taxa, second_taxa)
    # get the R matrix
    R_aug = Clustering.get_R_balaji(D_aug)
    # Get the matrix whose elements are block element sums
    # of the root-augmented R matrix.
    block_structure = [0]*len(first_taxa) + [1]*len(second_taxa) + [2]
    B = [[0]*3 for i in range(3)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            B[block_i][block_j] += R_aug[i][j]
    # get the new branch lengths for the subtree
    denominator = 2 * (B[0][1]*B[1][2] + B[1][2]*B[2][0] + B[2][0]*B[0][1])
    blen_first = B[1][2] / denominator
    blen_second = B[2][0] / denominator
    blen_root = B[0][1] / denominator
    return blen_first, blen_second, blen_root
def do_second_method(tree, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out):
    # get the covariance matrix
    ordered_names = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    cov = np.array(tree.get_covariance_matrix(ordered_names))
    # invert the covariance matrix to make the precision matrix
    prec = np.linalg.inv(cov)
    # take the block sums of the precision matrix
    block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2)
    name_order = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    block_prec = [[0]*4 for i in range(4)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            block_prec[block_i][block_j] += prec[i][j]
    # invert the block summed precision matrix
    reduced_cov = np.linalg.inv(np.array(block_prec))
    # extract the branch lengths from the reduced covariance matrix
    a = reduced_cov[0][0] - reduced_cov[0][1]
    b = reduced_cov[1][1] - reduced_cov[0][1]
    c = reduced_cov[2][2] - reduced_cov[2][3]
    d = reduced_cov[3][3] - reduced_cov[2][3]
    e = reduced_cov[0][1] + reduced_cov[2][3]
    # define the distance matrix for the reduced tree
    reduced_D = [
        [0, a+b, a+e+c, a+e+d],
        [b+a, 0, b+e+c, b+e+d],
        [c+e+a, c+e+b, 0, c+d],
        [d+e+a, d+e+b, d+c, 0]]
    # get the R matrix of the reduced tree
    reduced_R = Clustering.get_R_balaji(reduced_D)
    print >> out, 'second method:'
    print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a, b)
    print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c, d)
    print >> out, 'equivalent connecting branch length:', e
    print >> out, 'M for the equivalent tree:'
    print >> out, MatrixUtil.m_to_string(reduced_R)
    print >> out
def run(my_map, reviews, restaurants):
    restaurants = Clustering.filter_restaurants(restaurants)
    normalized_restaurant_ids_to_topics, lda = Clustering.get_predictions(
        my_map, reviews, restaurants)
    create_gaussian_clusters_and_map(
        restaurants, normalized_restaurant_ids_to_topics, my_map, lda)
# print data about each category of videos
#    print 'ID: ' + str(vidCat)
#    if 'en' in miniLangList.keys():
#        sent = sentimentAnalyzer(miniLangList['en'])
#        print 'Average Polarity: {:1.5f}'.format(sent[0])
#        print 'Average Extremity: {:1.5f}'.format(sent[1])
#        freqs = wordFrequency(miniLangList['en'])
#        print 'Top 20:'
#        print freqs[0:20]
#        spell = spellCheck(miniLangList['en'])
#        print 'Spelled Right: {}\nSpelled Wrong: {}'.format(spell[0], spell[1])  #, spell[2])
#    print [(k, len(miniLangList[k])) for k in sorted(miniLangList.keys())]
#    print ''
print 'ALL'
sent = sentimentAnalyzer(masterList['en'])
print 'Average Polarity: {:1.5f}'.format(sent[0])
print 'Average Extremity: {:1.5f}'.format(sent[1])
freqs = wordFrequency(masterList['en'])
print 'Top 100:'
print freqs[0:100]
spell = spellCheck(masterList['en'])
print 'Spelled Right: {}\nSpelled Wrong: {}'.format(spell[0], spell[1])  #, spell[2])
print 'Percentage Misspelled: {}'.format(spell[1] * 100.0 / (spell[0] + spell[1]))
langs = [(k, len(masterList[k])) for k in masterList.keys()]
print sorted(langs, key=lambda x: x[1], reverse=True)
Clustering.kMeans(masterList['en'], numClusters=10, numDefiningWords=15)
#Clustering.dumbClustering(masterList['en'])
def getMaxSimilarityForUPGMA(d):
    maxSimilarity = Clustering.getMax(d)
    return maxSimilarity
datawriter = csv.writer(dfile, delimiter=',')
labelwriter = csv.writer(lfile, delimiter=',')
infowriter = csv.writer(ifile, delimiter=',')
lcount = 1  # line count
# get test names into a dictionary
test_dict = testprocessing.get_test(reader, start_col)
rfile.seek(0)
# get diagnostic info into a dictionary
diag_dict = testprocessing.read_labtestcvs("LabTestsInfo.csv", test_dict)
# get the latest lab test result for every patient
latest_test_data = testprocessing.convert_test_line(reader, test_dict,
                                                    start_col)
rfile.seek(0)
# Get the class label for each state with respect to patient health states
classification = Clustering.runKmeans(state_file)
# append an empty row as the first row
infowriter.writerow([])
datawriter.writerow([])
labelwriter.writerow([])
for index, row in enumerate(reader):
    # if lcount < 20:
    if proc_train:
        convert_line(classification[index], latest_test_data[index],
                     diag_dict, row, datawriter, labelwriter, infowriter,
                     True)
        '''
        if index > 20:
            break
        '''
    else:
def get_response_content(fs):
    # read the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # create a putative list of nodes
    putative_nodes = []
    putative_nodes.extend(list(tree.gen_tips()))
    if not fs.standard:
        putative_nodes.extend(list(tree.gen_internal_nodes()))
    # get the ordered ids and ordered names of the selected nodes in the tree
    ordered_name_id_pairs = []
    for node in putative_nodes:
        name = node.get_name()
        if fs.named and not name:
            continue
        ordered_name_id_pairs.append((name, id(node)))
    ordered_ids = [id_ for name, id_ in ordered_name_id_pairs]
    ordered_names = [name for name, id_ in ordered_name_id_pairs]
    id_to_index = dict((id_, i) for i, id_ in enumerate(ordered_ids))
    # assert that names are non-empty
    for name in ordered_names:
        if not name:
            raise HandlingError("each node must be named")
    # assert that names are unique
    n = len(ordered_ids)
    if len(set(ordered_names)) != n:
        raise HandlingError("each node must be uniquely named")
    # get the R matrix from the tree; this is -1/2 times the laplacian matrix
    if fs.standard:
        D = tree.get_distance_matrix(ordered_names)
    elif fs.augmented:
        D = tree.get_full_distance_matrix(ordered_ids)
    elif fs.named:
        D = tree.get_partial_distance_matrix(ordered_ids)
    R = Clustering.get_R_balaji(D)
    R_trace = sum(R[i][i] for i in range(n))
    # get the best partition and partition value
    value_Y_pairs = []
    for Y in Clustering.gen_assignments(n):
        value = Clustering.get_exact_criterion(R, Y)
        value_Y_pairs.append((value, Y))
    best_value, best_Y = max(value_Y_pairs)
    # convert the best Y vector to a partition
    pos_set = set(ordered_names[i] for i, el in enumerate(best_Y) if el > 0)
    neg_set = set(ordered_names[i] for i, el in enumerate(best_Y) if el < 0)
    # get fiedler split information
    fiedler_eigenvector = get_eigenvector_of_interest(R)
    fiedler_pos_set = set(ordered_names[i]
                          for i, elem in enumerate(fiedler_eigenvector)
                          if elem > 0)
    fiedler_neg_set = set(ordered_names[i]
                          for i, elem in enumerate(fiedler_eigenvector)
                          if elem < 0)
    # write the paragraphs
    paragraphs = []
    if fs.show_split:
        lines = [
            "exact criterion partition:",
            str(list(best_Y)),
            set_to_string((set_to_string(neg_set), set_to_string(pos_set))),
        ]
        paragraphs.append("\n".join(lines))
    if fs.show_value:
        lines = ["exact criterion value:", str(best_value)]
        paragraphs.append("\n".join(lines))
    if fs.show_value_minus_trace:
        lines = ["exact criterion value minus trace:",
                 str(best_value - R_trace)]
        paragraphs.append("\n".join(lines))
    if fs.show_fiedler_split:
        lines = [
            "spectral sign partition:",
            set_to_string((set_to_string(fiedler_neg_set),
                           set_to_string(fiedler_pos_set))),
        ]
        paragraphs.append("\n".join(lines))
    if fs.show_fiedler_eigenvector:
        lines = ["eigenvector of interest:", str(list(fiedler_eigenvector))]
        paragraphs.append("\n".join(lines))
    if fs.show_labels:
        lines = ["ordered labels:"] + ordered_names
        paragraphs.append("\n".join(lines))
    if fs.show_distance_matrix:
        if fs.augmented:
            title = "augmented distance matrix:"
        elif fs.standard:
            title = "distance matrix:"
        elif fs.named:
            title = "distance matrix:"
        lines = [title, MatrixUtil.m_to_string(D)]
        paragraphs.append("\n".join(lines))
    if fs.show_M_matrix:
        lines = ["M matrix:", MatrixUtil.m_to_string(R)]
        paragraphs.append("\n".join(lines))
    # return the response
    return "\n\n".join(paragraphs) + "\n"
def run(my_map, reviews, restaurants):
    restaurants = Clustering.filter_restaurants(restaurants, reviews)
    normalized_restaurant_ids_to_topics, lda = Clustering.get_predictions(
        my_map, reviews, restaurants)
    elbow_clustering(restaurants, normalized_restaurant_ids_to_topics, my_map)
def elbow_clustering(restaurants, restaurant_ids_to_topics, my_map):
    data = Clustering.create_data_array(restaurants,
                                        restaurant_ids_to_topics, my_map)
    print "starting elbow clustering"
    ElbowClustering.plot_elbow_and_gap(data)
def get_response_content(fs): # read the values from the form subtree_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree) taxa_a1 = Util.get_stripped_lines(StringIO(fs.taxa_a1)) taxa_a2 = Util.get_stripped_lines(StringIO(fs.taxa_a2)) subtree_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree) taxa_b1 = Util.get_stripped_lines(StringIO(fs.taxa_b1)) taxa_b2 = Util.get_stripped_lines(StringIO(fs.taxa_b2)) connecting_branch_length = fs.blen # assert that no group of taxa contains duplicates for taxa in (taxa_a1, taxa_a2, taxa_b1, taxa_b2): if len(set(taxa)) != len(taxa): raise HandlingError('one of the lists of taxa contains duplicates') # assert that each subtree has at least two tips and no duplicates for tree in (subtree_a, subtree_b): tip_names = list(node.get_name() for node in tree.gen_tips()) if len(tip_names) < 2: raise HandlingError('each subtree should have at least two tips') if len(set(tip_names)) != len(tip_names): raise HandlingError('a subtree has duplicate tip names') # assert that the partitions are valid first_group = ('A', subtree_a, taxa_a1, taxa_a2) second_group = ('B', subtree_b, taxa_b1, taxa_b2) for tree_name, tree, taxa_1, taxa_2 in (first_group, second_group): tip_names = set(node.get_name() for node in tree.gen_tips()) for group_name, taxa in (('1', taxa_1), ('2', taxa_2)): nonsense_names = list(set(taxa) - set(tip_names)) msg_a = 'the following taxa in group %s ' % group_name msg_b = 'of subtree %s ' % tree_name msg_c = 'are not valid tips: %s' % str(nonsense_names) message = msg_a + msg_b + msg_c if nonsense_names: raise HandlingError(message) if set(taxa_1) & set(taxa_2): msg_a = 'the taxon lists for subtree %s ' % tree_name msg_b = 'are not disjoint' raise HandlingError(msg_a + msg_b) if set(taxa_1) | set(taxa_2) < tip_names: msg_a = 'a tip in subtree %s ' % tree_name msg_b = 'is not represented in either of the groups' raise HandlingError(msg_a + msg_b) # define the response out = StringIO() # get the results for the first method do_first_method(subtree_a, subtree_b, taxa_a1, taxa_a2, taxa_b1, taxa_b2, connecting_branch_length, out) # define the entire tree by connecting the subtrees subtree_b.get_root().set_branch_length(connecting_branch_length) subtree_a.get_root().add_child(subtree_b.get_root()) tree = subtree_a # define the order and structure of the distance matrix block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2) name_order = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2 # get the distance matrix fel_tree = NewickIO.parse(NewickIO.get_newick_string(tree), FelTree.NewickTree) D = fel_tree.get_distance_matrix(name_order) # get the R matrix R = Clustering.get_R_balaji(D) # get the sums of block elements of R block_R = [[0]*4 for i in range(4)] for i, block_i in enumerate(block_structure): for j, block_j in enumerate(block_structure): block_R[block_i][block_j] += R[i][j] # show the results from the second method do_second_method(fel_tree, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out) # show the results from the third method tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree) tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree) for t in (tree_m3_a, tree_m3_b): neo = Newick.NewickNode() neo.name = 'special' neo.blen = connecting_branch_length / 2 t.get_root().add_child(neo) feltree_m3_a = NewickIO.parse(NewickIO.get_newick_string(tree_m3_a), FelTree.NewickTree) feltree_m3_b = NewickIO.parse(NewickIO.get_newick_string(tree_m3_b), FelTree.NewickTree) tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree) tree_m3_b = 
NewickIO.parse(fs.subtree_b, Newick.NewickTree) new_root = Newick.NewickNode() tree_m3_a.get_root().blen = connecting_branch_length / 2 tree_m3_b.get_root().blen = connecting_branch_length / 2 new_root.add_child(tree_m3_a.get_root()) new_root.add_child(tree_m3_b.get_root()) tree_m3 = Newick.NewickTree(new_root) feltree_m3 = NewickIO.parse(NewickIO.get_newick_string(tree_m3), FelTree.NewickTree) branch_d2 = connecting_branch_length / 2 do_third_method(feltree_m3_a, feltree_m3_b, feltree_m3, branch_d2, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out) # show the expected results print >> out, 'M:' print >> out, MatrixUtil.m_to_string(R) print >> out, 'M summed within blocks:' print >> out, MatrixUtil.m_to_string(block_R) # return the response return out.getvalue()
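# get_response_content sums R-matrix entries within blocks using nested Python loops.
# A vectorized numpy equivalent, with a hypothetical helper name:
import numpy as np

def sum_within_blocks(R, block_structure, n_blocks):
    # block_structure[i] is the block index of row/column i of R.
    R = np.asarray(R, dtype=float)
    idx = np.asarray(block_structure)
    block_R = np.zeros((n_blocks, n_blocks))
    # unbuffered accumulation over every (i, j) pair of block indices
    np.add.at(block_R, (idx[:, None], idx[None, :]), R)
    return block_R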
def _Go(): global flag;global num_days;global length;global steps;global f;global sa;global plots;global curr_pos sel_task = task.get() if (data_path == None): display_error(1) elif (sel_task == 'None'): display_error(2) elif (sel_task == 'Summarize'): curr_pos = 0 plots = [] name = str(stock.get()) if name != "None": lot = length.get() if lot=="": display_error(3) else: lot = int(lot) if type(lot) != int: display_error(4) elif lot <= 0: display_error(5) if lot != "" and type(int(lot)) == int and int(lot) > 0: data = LoadTimeseries.read_p_timeseries(data_path, name, lot) original, summarized = Summarize.summarize(data) plt.clf() f = plt.figure(figsize=(6,6), dpi=100, facecolor='white') plt.subplot(411) plt.title("Initial Time Series") original[name].ix[original[name].index].plot(style='b') plt.subplot(412) plt.title("Overall Time Series Trend") plt.plot(summarized[name]["trend"]["x"],summarized[name]["trend"]["r"],'g') plt.xlim([0,lot]) plt.subplot(413) plt.title("Turning Points") plt.plot(summarized[name]["extremas-x"], summarized[name]["extremas"], 'o', mfc='none', markersize=7) plt.xlim([0,lot]) plt.subplot(414) plt.title("Seasonality and Cycle") plt.plot(summarized[name]["Ds-x"], summarized[name]["Ds"], 'r') plt.xlim([0,lot]) plt.tight_layout() canvas.figure = f canvas.draw() else: display_error(6) elif sel_task == 'Clustering': n = num.get() lot = length.get() if n == "": display_error(7) else: n = int(n) if type(n) != int: display_error(8) elif n <= 0: display_error(9) if lot == "": display_error(10) else: lot = int(lot) if type(lot) != int: display_error(11) elif lot <= 0: display_error(12) if type(lot) == int and type(n) == int and lot > 0 and n > 0: curr_pos = 0 plots = [] data, aDates = LoadTimeseries.read_c_timeseries(data_path, n, lot) original, summarized = Summarize4Clustering.summarize(data, aDates, lot) plt.clf() plots = Clustering.cluster(original, summarized, n) canvas.figure = plots[0] canvas.draw() elif sel_task == 'Prediction': curr_pos = 0 plots = [] name = str(stock.get()) if name != "None": lot = length.get() sp = steps.get() if sp == "": display_error(13) else: sp = int(sp) if type(sp) != int: display_error(14) elif sp <= 0: display_error(15) if lot != "": lot = int(lot) if type(lot) != int: display_error(11) elif lot <= 0: display_error(12) if lot+sp > num_days: display_error(16) if type(lot) == int and lot > 0 and type(sp) == int and sp > 0 and lot+sp<=num_days: data = LoadTimeseries.read_p_timeseries(data_path, name, lot+sp) original, summarized = Summarize4Prediction.summarize(data,sp) orig_pre, orig_r_pre, sum_pre = Prediction.predict(original, summarized[name]["Ds"], summarized[name]["AL"], sp) plt.clf() f = plt.figure(figsize=(6,6), dpi=100, facecolor='white') original[name].ix[original[name].index[0]:].plot(style='b', label="Actual") orig_pre.ix[orig_pre.index[0]:].plot(style='r',label="ARMA") orig_r_pre.ix[orig_r_pre.index[0]:].plot(style='purple',label="r-ARMA") sum_pre.ix[orig_pre.index[0]:].plot(style='g', label="Summarized") plt.legend(loc=2,prop={'size':10}) canvas.figure = f canvas.draw() else: display_error(3)
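# The field validation in _Go repeats the same empty-check / int() / positivity
# pattern for each entry box, and since int(s) either returns an int or raises
# ValueError, the type(...) checks after a successful conversion can never fire.
# A hedged refactor sketch with a hypothetical helper name:
def parse_positive_int(raw, error_code):
    # Returns a positive int parsed from the entry text, or None after
    # reporting the problem through the GUI's existing display_error hook.
    if raw == "":
        display_error(error_code)
        return None
    try:
        value = int(raw)
    except ValueError:
        display_error(error_code)
        return None
    if value <= 0:
        display_error(error_code)
        return None
    return value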
def do_third_method(tree_a, tree_b, tree, branch_d2, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out): print >> out, 'third method:' # get the covariance matrices of the mini trees ordered_names = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2 ordered_names_a = taxa_a1 + taxa_a2 + ['special'] ordered_names_b = taxa_b1 + taxa_b2 + ['special'] block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2) block_structure_a = [0]*len(taxa_a1) + [1]*len(taxa_a2) + [2] block_structure_b = [0]*len(taxa_b1) + [1]*len(taxa_b2) + [2] cov = np.array(tree.get_covariance_matrix(ordered_names)) cov_a = np.array(tree_a.get_covariance_matrix(ordered_names_a)) cov_b = np.array(tree_b.get_covariance_matrix(ordered_names_b)) prec_a = np.linalg.inv(cov_a) block_prec_a = [[0]*3 for i in range(3)] for i, block_i in enumerate(block_structure_a): for j, block_j in enumerate(block_structure_a): block_prec_a[block_i][block_j] += prec_a[i][j] prec_b = np.linalg.inv(cov_b) block_prec_b = [[0]*3 for i in range(3)] for i, block_i in enumerate(block_structure_b): for j, block_j in enumerate(block_structure_b): block_prec_b[block_i][block_j] += prec_b[i][j] a = block_prec_a[0][0] b = block_prec_a[0][1] d = block_prec_a[1][1] e = block_prec_b[0][0] f = block_prec_b[0][1] h = block_prec_b[1][1] x = branch_d2 # make the block M matrix using a clever formula Q_a = [ [a, b, 0, 0], [b, d, 0, 0], [0, 0, e, f], [0, 0, f, h]] den_a = (a + 2*b + d + 1/x) den_b = (e + 2*f + h + 1/x) Q_b = [ [(a+b)*(a+b)/den_a, (a+b)*(b+d)/den_a, 0, 0], [(b+d)*(a+b)/den_a, (b+d)*(b+d)/den_a, 0, 0], [0, 0, (e+f)*(e+f)/den_b, (e+f)*(f+h)/den_b], [0, 0, (f+h)*(e+f)/den_b, (f+h)*(f+h)/den_b]] glom = a+2*b+d+e+2*f+h+2*x*(a+2*b+d)*(e+2*f+h) den_a2 = (den_a / den_b) * glom den_b2 = (den_b / den_a) * glom Q_c = [ [(a+b)*(a+b)/den_a2, (a+b)*(b+d)/den_a2, (a+b)*(e+f)/glom, (a+b)*(f+h)/glom], [(b+d)*(a+b)/den_a2, (b+d)*(b+d)/den_a2, (b+d)*(e+f)/glom, (b+d)*(f+h)/glom], [(e+f)*(a+b)/glom, (e+f)*(b+d)/glom, (e+f)*(e+f)/den_b2, (e+f)*(f+h)/den_b2], [(f+h)*(a+b)/glom, (f+h)*(b+d)/glom, (f+h)*(e+f)/den_b2, (f+h)*(f+h)/den_b2]] Q = np.array(Q_a) - np.array(Q_b) - np.array(Q_c) print >> out, 'cleverly constructed block M:' print >> out, MatrixUtil.m_to_string(Q) # make the equivalent tree a_star = (b+d)/(a*d-b*b) b_star = (a+b)/(a*d-b*b) c_star = (f+h)/(e*h-f*f) d_star = (e+f)/(e*h-f*f) e_star = 2*x - b/(a*d-b*b) - f/(e*h-f*f) print >> out, 'using the block precision matrix:' print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a_star, b_star) print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c_star, d_star) print >> out, 'equivalent connecting branch length:', e_star # make the block M matrix using Eric's formula (corrected) A, B, C, D, E = a_star, b_star, c_star, d_star, e_star H = A*B*(C+D) + C*D*(A+B) + E*(C+D)*(A+B) Q = [[1/H]*4 for i in range(4)] Q[0][0] *= B*D + B*C + C*D + E*(C+D) Q[0][1] *= -C*D - E*(C+D) Q[0][2] *= -B*D Q[0][3] *= -B*C Q[1][0] *= -C*D - E*(C+D) Q[1][1] *= A*D + A*C + C*D + E*(C+D) Q[1][2] *= -A*D Q[1][3] *= -A*C Q[2][0] *= -B*D Q[2][1] *= -A*D Q[2][2] *= A*B + B*D + A*D + E*(A+B) Q[2][3] *= -A*B - E*(A+B) Q[3][0] *= -B*C Q[3][1] *= -A*C Q[3][2] *= -A*B - E*(A+B) Q[3][3] *= A*B + B*C + A*C + E*(A+B) print >> out, 'reconstructed block M:' print >> out, MatrixUtil.m_to_string(Q) M = Clustering.get_R_balaji(cov) M_a = Clustering.get_R_balaji(cov_a) M_b = Clustering.get_R_balaji(cov_b) block_M = [[0]*4 for i in range(4)] for i, block_i in enumerate(block_structure): for j, block_j in enumerate(block_structure): 
block_M[block_i][block_j] += M[i][j] block_M_a = [[0]*3 for i in range(3)] for i, block_i in enumerate(block_structure_a): for j, block_j in enumerate(block_structure_a): block_M_a[block_i][block_j] += M_a[i][j] block_M_b = [[0]*3 for i in range(3)] for i, block_i in enumerate(block_structure_b): for j, block_j in enumerate(block_structure_b): block_M_b[block_i][block_j] += M_b[i][j] c_1 = block_M_a[0][1] c_2 = block_M_a[0][2] c_3 = block_M_a[1][2] denominator = (c_1*c_2) + (c_2*c_3) + (c_3*c_1) a_star = -c_3 / denominator b_star = -c_2 / denominator e_star_a = -c_1 / denominator c_1 = block_M_b[0][1] c_2 = block_M_b[0][2] c_3 = block_M_b[1][2] denominator = (c_1*c_2) + (c_2*c_3) + (c_3*c_1) c_star = -c_3 / denominator d_star = -c_2 / denominator e_star_b = -c_1 / denominator e_star = e_star_a + e_star_b print >> out, 'using the block M matrix:' print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a_star, b_star) print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c_star, d_star) print >> out, 'equivalent connecting branch length:', e_star print >> out, 'calculated block M:' print >> out, MatrixUtil.m_to_string(block_M) print >> out
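# The star-tree recovery that do_third_method performs twice, factored into a
# standalone helper for clarity; the function name is hypothetical, but the
# arithmetic is exactly the formula used above.
def star_tree_branch_lengths(block_M):
    # block_M is a 3x3 block-summed M matrix; its off-diagonal entries
    # determine the three branch lengths of the equivalent star tree.
    c_1, c_2, c_3 = block_M[0][1], block_M[0][2], block_M[1][2]
    denominator = c_1*c_2 + c_2*c_3 + c_3*c_1
    return (-c_3/denominator, -c_2/denominator, -c_1/denominator)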
parser = argparse.ArgumentParser() parser.add_argument("--file", "-f", help='a single filename to read from (can be compiled using feedvector)') parser.add_argument("--draw", "-d", help='this outputs a dendrogram jpg for hclusters and a 2d map for kclusters', action="store_true") parser.add_argument("--rotate", "-r", help='flip the rows and columns', action="store_true") parser.add_argument("--kcluster", "-k", help="uses a kcluster instead of an hcluster", action="store_true") parser.add_argument("--scaledown", "-s", help="uses scaling algorithm for clustering display", action="store_true") args = parser.parse_args() blognames, words, data = readfile(str(args.file)) if args.rotate: data = Clustering.rotatematrix(data) print "data rotated" if args.kcluster: clust = Clustering.kcluster(data) else: clust = Clustering.hcluster(data) if args.rotate: Clustering.printclust(clust, labels=words) else: Clustering.printclust(clust, labels=blognames) if args.draw: if args.kcluster: print repr(clust)
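# The snippet above begins mid-call, so the ArgumentParser construction and the
# --file argument are reconstructed from the later use of args.file, and
# Clustering.kcluster is inferred from the --kcluster help text. The rotate
# behavior described there ("flip the rows and columns") amounts to a transpose;
# a minimal sketch, assuming a plain list-of-lists matrix:
def rotatematrix_sketch(data):
    # rows become columns and vice versa
    return [list(row) for row in zip(*data)]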
def main() : # Start of TRAP sys.stdout.write("\n###### Pathway and clustering analysis ######\n\n") sys.stdout.flush() cuffPath = "cufflinks_result" diffPath = "cuffdiff_result" resultPath = "TRAP_result" controlList = [] caseList = [] diffList = [] timeLen = 0 geneIDPath = "" pnamePath = "" kgmlPath = "" xmlPath = "" cuffdiff = "" pCut = 0.05 DEGCut = 2.0 clusterCut = 2.0 timeLag = 1.0 fcList = {} # fcList[geneID]=[fc_0, ... , fc_t] pVal = {} # pVal[geneID]=[p_0, ..., p_t] idDic = {} # idDic[keggID]=geneID pnameDic = {} # pnameDic[pID]=pathwayName # Reading configuration file sys.stdout.write("Reading configuration file\t......\t") sys.stdout.flush() try : config = open("config.txt", "r") while True : cl = config.readline() if cl=="" : break tp = cl.split("=") if (len(tp)<2) : continue key=tp[0].strip() val=tp[1].strip() if (key[:7]=="control") : controlList.append(val.split(',')) elif (key[:9]=="treatment") : caseList.append(val.split(',')) elif (key=="numTP") : timeLen=int(val) elif (key=="convfilePath") : geneIDPath = val elif (key=="pnamePath") : pnamePath = val elif (key=="kgmlPath") : kgmlPath = val elif (key=="cuffdiff") : cuffdiff = val elif (key=="pVal") : pCut = float(val) elif (key[:4]=="diff") : diffList.append(val) elif (key=="DEGCut") : DEGCut = float(val) elif (key=="clusterCut") : clusterCut = float(val) elif (key=="timeLag") : timeLag = float(val) else : continue idFile = open(geneIDPath, "r") pnameFile = open(pnamePath, "r") xmlPath = os.walk(kgmlPath) if (cuffdiff=="no" and len(controlList)!=len(caseList)) : raise ValueError('the control and treatment lists must cover the same number of time points') except IOError: print "Check if the configuration file exists" except : print "Configuration file error" raise # Make sure the result path exists try: os.makedirs(resultPath) except OSError: if not os.path.isdir(resultPath): raise # Copy the config file so we keep a record of the parameters that were used copy("config.txt", resultPath) # Reading ID-conversion / pathway name file for ids in idFile.readlines() : tp = ids.split("\t") tp2 = tp[1].split(";") tp3 = tp2[0].split(", ") if tp[0] in idDic : for name in tp3 : idDic[tp[0]].append(name.strip()) else : idDic[tp[0]]=[] for name in tp3 : idDic[tp[0]].append(name.strip()) idFile.close() for path in pnameFile.readlines() : tp = path.split("\t") tp2 = tp[0].split(":") tp3 = tp[1].split(" - ") pnameDic[tp2[1]]=tp3[0] pnameFile.close() sys.stdout.write("Done\n") sys.stdout.flush() # Reading fpkm files sys.stdout.write("Reading expression files\t......\t") sys.stdout.flush() geneSum = set() if cuffdiff=="yes" : for j in range(timeLen) : pfile = open(os.path.join(diffPath, diffList[j], "gene_exp.diff"), "r") for l in pfile.readlines() : if l.startswith('#'): continue tp = l.split() if not is_number(tp[9]) : continue geneSum.add(tp[2]) pfile.close() for gene in geneSum : fcList[gene]=[] pVal[gene]=[] for j in range(timeLen) : pfile = open(os.path.join(diffPath, diffList[j], "gene_exp.diff"), "r") temp = {} temp2 = {} for l in pfile.readlines() : if l.startswith('#'): continue tp = l.split() if not is_number(tp[9]) : continue if ( tp[9]=='inf' or tp[9] == '-inf') : temp[tp[2]]=0 else: temp[tp[2]]=float(tp[9]) temp2[tp[2]]=float(tp[12]) for gene in geneSum : if gene in temp : fcList[gene].append(temp[gene]) pVal[gene].append(temp2[gene]) else : fcList[gene].append(0) pVal[gene].append(1) pfile.close() else : for j in range(timeLen) : for con in controlList[j] : pfile = open(os.path.join(cuffPath, con, "genes.fpkm_tracking"), "r") for l in pfile.readlines() : tp = l.split() if not is_number(tp[9]) : continue geneSum.add(tp[4]) 
pfile.close() for case in caseList[j] : pfile = open(os.path.join(cuffPath, case, "genes.fpkm_tracking"), "r") for l in pfile.readlines() : tp = l.split() if not is_number(tp[9]) : continue geneSum.add(tp[4]) pfile.close() for gene in geneSum : fcList[gene]=[] for j in range(timeLen) : temp1 = {} temp2 = {} for con in controlList[j] : pfile = open(os.path.join(cuffPath, con, "genes.fpkm_tracking"), "r") for l in pfile.readlines() : tp = l.split() if (tp[9]=="FPKM") : continue if tp[4] in temp1 : temp1[tp[4]].append(float(tp[9])) else : temp1[tp[4]]=[float(tp[9])] pfile.close() for case in caseList[j] : pfile = open(os.path.join(cuffPath, case, "genes.fpkm_tracking"), "r") for l in pfile.readlines() : tp = l.split() if (tp[9]=="FPKM") : continue if tp[4] in temp2 : temp2[tp[4]].append(float(tp[9])) else : temp2[tp[4]]=[float(tp[9])] pfile.close() for gene in geneSum : med1 = 0 med2 = 0 if gene in temp1 and gene in temp2 : med1 = median(temp1[gene]) med2 = median(temp2[gene]) elif gene in temp1 : med1 = median(temp1[gene]) elif gene in temp2 : med2 = median(temp2[gene]) else : pass if (abs(med2-med1)<1.0) : fcList[gene].append(0) else : fcList[gene].append(math.log((med2+0.01)/(med1+0.01),2)) sys.stdout.write("Done\n") sys.stdout.flush() # Parsing XML files to get gene and relation information sys.stdout.write("Reading xml files\t\t......\t") sys.stdout.flush() i=0 ind = {} DEG = [] wgene = [] wredic = [] empty = [] empty2 = [] for t in range(0, timeLen) : wgene.append([]) #wgene[t][i]={keggID:fc} DEG.append([]) #DEG[t][i]=set(keggID) empty.append(0) empty2.append(1) for root,dirs,files in xmlPath: for file in files: filetp = file.split(".") ind[filetp[0]]=i for j in range(0, timeLen) : wgene[j].append({}) DEG[j].append(set()) wredic.append({}) #wredic[i]={keggID:(list of [asc, length, j])} xmlfile = open(os.path.join(kgmlPath, file), "r") xmldata = xmlfile.read() dom = parseString(xmldata) xmlfile.close() geneSet = set() entrydic = {} entries = dom.getElementsByTagName("entry") for e in entries : if (e.attributes.getNamedItem("type").nodeValue == 'gene') : id = e.attributes.getNamedItem("id").nodeValue id = str(id) genes = e.attributes.getNamedItem("name").nodeValue genes = str(genes) genelist = genes.split() entrydic[id]=[] for g in genelist : entrydic[id].append(g) geneSet.add(g) elif (e.attributes.getNamedItem("type").nodeValue == 'group') : id = e.attributes.getNamedItem("id").nodeValue id = str(id) comps = e.getElementsByTagName("component") entrydic[id]=[] for c in comps : geneId = c.attributes.getNamedItem("id").nodeValue for g in entrydic[geneId] : entrydic[id].append(g) geneSet.add(g) for g in geneSet : if (g in idDic) : nameExist = 0 tpName = "" for name in idDic[g] : if name in fcList.keys() : nameExist = 1 tpName = name break if nameExist==1 : for t in range(0, timeLen) : foldchange = fcList[tpName][t] wgene[t][i][g]=foldchange if (cuffdiff=="yes" and pVal[tpName][t]<=pCut and abs(foldchange)>=DEGCut) : DEG[t][i].add(g) elif (cuffdiff=="no" and abs(foldchange)>=DEGCut) : DEG[t][i].add(g) else : for t in range(0, timeLen) : wgene[t][i][g]=0 fcList[idDic[g][0]]=empty if (cuffdiff=="yes") : pVal[idDic[g][0]]=empty2 else : for t in range(0, timeLen) : wgene[t][i][g]=0 fcList[g]=empty if (cuffdiff=="yes") : pVal[g]=empty2 redic = wredic[i] relations = dom.getElementsByTagName("relation") for r in relations : subs = r.getElementsByTagName("subtype") ent1 = r.attributes.getNamedItem("entry1").nodeValue ent2 = r.attributes.getNamedItem("entry2").nodeValue if subs :
for s in subs : rel_type = s.attributes.getNamedItem("name").nodeValue if (rel_type=="activation" or rel_type=="expression") : j=1 elif (rel_type=="inhibition" or rel_type=="repression") : j=-1 else : j=0 if (j!=0 and (ent1!=ent2) and (ent1 in entrydic) and (ent2 in entrydic)) : for desc in entrydic[ent2] : length = len(entrydic[ent2]) for asc in entrydic[ent1] : if (desc in redic) : redic[desc].append([asc, length, j]) else : redic[desc]=[[asc, length, j]] i=i+1 fileN = i sys.stdout.write("Done\n") sys.stdout.flush() # 1. One time point SPIA analysis sys.stdout.write("One time point SPIA analysis\n") sys.stdout.flush() for t in range(0, timeLen) : sys.stdout.write("\ttime point "+str(t+1)+"\t......\t") sys.stdout.flush() OT.pathwayAnalysis(os.path.join(resultPath, "OneTime_"+str(t+1)), fileN, wgene[t], wredic, DEG[t], DEGCut, idDic, pnameDic, ind) sys.stdout.write("Done\n") sys.stdout.flush() # 2. Time-series SPIA analysis sys.stdout.write("Time-series SPIA analysis\t......\t") sys.stdout.flush() TS.pathwayAnalysis(os.path.join(resultPath, "TimeSeries"), wgene, wredic, DEG, idDic, pnameDic, timeLag, timeLen, ind, fcList) sys.stdout.write("Done\n") sys.stdout.flush() # 3. Clustering analysis sys.stdout.write("Clustering analysis\t\t......\t") sys.stdout.flush() CL.clusteringAnalysis(os.path.join(resultPath, "Clustering"), wgene, fcList, pVal, idDic, pnameDic, clusterCut, pCut, timeLen, ind, cuffdiff) sys.stdout.write("Done\n") sys.stdout.flush()
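# The fold-change rule used in main, isolated for clarity: median differences
# under 1.0 FPKM are treated as no change, and a 0.01 pseudocount guards the
# log against zero medians. The helper name is hypothetical.
import math

def log2_fold_change(med_control, med_case, min_delta=1.0, pseudocount=0.01):
    if abs(med_case - med_control) < min_delta:
        return 0
    return math.log((med_case + pseudocount) / (med_control + pseudocount), 2)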
def create_gaussian_clusters_and_map(restaurants, restaurant_ids_to_topics, my_map, lda, use_human_labels=True): data = Clustering.create_data_array(restaurants, restaurant_ids_to_topics, my_map) Clustering.plot_gaussian_clusters(my_map, restaurants, restaurant_ids_to_topics, data, lda)
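# Clustering.plot_gaussian_clusters is not shown in this file; a minimal sketch
# of Gaussian mixture clustering on the data array, assuming scikit-learn, which
# may or may not match the project's actual implementation.
import numpy as np
from sklearn.mixture import GaussianMixture

def gaussian_cluster_sketch(data, n_components=5):
    data = np.asarray(data)
    gmm = GaussianMixture(n_components=n_components, covariance_type='full')
    labels = gmm.fit_predict(data)
    return gmm, labels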