Example No. 1
def get_response_content(fs):
    # read the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get ordered identifiers
    ordered_tip_name_id_pairs = list(sorted(set((node.get_name(), id(node))
        for node in tree.gen_tips())))
    ordered_tip_names, ordered_tip_ids = zip(*ordered_tip_name_id_pairs)
    ordered_internal_ids = [id(node)
            for node in tree.preorder() if not node.is_tip()]
    ordered_ids = list(ordered_tip_ids) + ordered_internal_ids
    # get the distance matrices
    full_D = tree.get_partial_distance_matrix(ordered_ids)
    partial_D = tree.get_partial_distance_matrix(ordered_tip_ids)
    # get the balaji matrices
    full_R = Clustering.get_R_balaji(full_D)
    partial_R = Clustering.get_R_balaji(partial_D)
    # Get the fiedler eigenvector and another eigenvector
    # for the full and the partial balaji matrices.
    full_va, full_vb = get_eigenvectors(full_R)
    partial_va, partial_vb = get_eigenvectors(partial_R)
    # create the response
    out = StringIO()
    print >> out, 'Fiedler vector associated with the graph'
    print >> out, 'for which the internal nodes are hidden:'
    print >> out, str(tuple(partial_va))
    print >> out
    print >> out, 'The tip subvector of the Fiedler vector'
    print >> out, 'associated with the graph of the full tree:'
    print >> out, str(tuple(full_va[:len(ordered_tip_ids)]))
    # write the response
    return out.getvalue()
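
get_eigenvectors is not included in this listing; a hedged sketch of what such a helper might look like, assuming it takes a symmetric Laplacian-like matrix and returns the Fiedler vector (eigenvector of the second-smallest eigenvalue) together with the next eigenvector:

import numpy as np

def get_eigenvectors_sketch(R):
    # Assumes R is symmetric; eigh returns eigenvalues in ascending order.
    w, v = np.linalg.eigh(np.asarray(R, dtype=float))
    fiedler = v[:, 1]   # eigenvector of the second-smallest eigenvalue
    next_vec = v[:, 2]  # the following eigenvector
    return fiedler, next_vec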
Example No. 2
def run(my_map, reviews, restaurants):
    restaurants = Clustering.filter_restaurants(restaurants)
    normalized_restaurant_ids_to_topics, lda = Clustering.get_predictions(
        my_map, reviews, restaurants)
    create_topic_clusters_and_map(restaurants,
                                  normalized_restaurant_ids_to_topics, my_map,
                                  lda)
Example No. 3
    def plot_clusters(self,
                      clustering: Clustering,
                      title_fs=14,
                      label_fs=12,
                      ticks_fs=12):
        plt.figure(figsize=self.figsize)
        labels = clustering.get_labels()
        n_clusters = clustering.get_n_clusters()
        title = clustering.get_title()
        self.colors = sns.color_palette(palette=clustering.get_palette(),
                                        n_colors=n_clusters).as_hex()

        plt.scatter(self.visualized_data.values[:, 0],
                    self.visualized_data.values[:, 1],
                    s=30,
                    c=[self.cluster_color(label) for label in labels],
                    alpha=0.5)

        clusters_ax = plt.gca()
        clusters_ax.set_title(title, fontsize=title_fs)
        clusters_ax.set_xlabel('dim1', fontsize=label_fs)
        clusters_ax.set_ylabel('dim2', fontsize=label_fs)
        for tick in clusters_ax.xaxis.get_major_ticks(
        ) + clusters_ax.yaxis.get_major_ticks():
            tick.label.set_fontsize(ticks_fs)

        plt.savefig(self.plots_dir + title)
Example No. 4
def run_pipeline_with_pretrained_doc2vec():
    all_documents = DataLoader.load_all_documents()
    adjacency_matrix_references_all_documents = DataLoader.load_adjacency_matrix_all_documents(
    )
    # Load model
    model = DataLoader.load_model()
    # Runs HDBSCAN, returns a list of labels (one label per document; -1 == outlier)
    labels = Clustering.run_hdbscan(model=model,
                                    min_cluster_size=4,
                                    min_samples=4)

    # Extract the documents that were actually clustered, dropping the outliers.
    # Returns: a mask of which documents to keep, the labels of the clustered
    # documents, and the clustered documents themselves.
    mask, labels_subset, clustered_documents = Clustering.extract_clustered_documents(
        all_documents, labels)

    # Creates the adjacency matrix for references between clusters
    cluster_references_adjacency = Clustering.create_adjacency_matrix_for_clusters(
        mask=mask,
        labels=labels_subset,
        adjacency_references_all_documents=
        adjacency_matrix_references_all_documents)

    # k-nearest undirected adjacency
    cluster_references_adjacency = Clustering.make_adjacency_matrix_undirected(
        cluster_references_adjacency, k=3)
    DataLoader.save_data(cluster_references_adjacency, clustered_documents,
                         labels_subset)

    # Creates the graph and sets up the interactive webpage showing the graph
    visualization.doc_to_vec_visualize(documents=clustered_documents,
                                       adj_matrix=cluster_references_adjacency,
                                       labels=labels_subset)
Example No. 5
def get_response_content(fs):
    # read the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get ordered identifiers
    ordered_tip_name_id_pairs = list(
        sorted(set((node.get_name(), id(node)) for node in tree.gen_tips())))
    ordered_tip_names, ordered_tip_ids = zip(*ordered_tip_name_id_pairs)
    ordered_internal_ids = [
        id(node) for node in tree.preorder() if not node.is_tip()
    ]
    ordered_ids = list(ordered_tip_ids) + ordered_internal_ids
    # get the distance matrices
    full_D = tree.get_partial_distance_matrix(ordered_ids)
    partial_D = tree.get_partial_distance_matrix(ordered_tip_ids)
    # get the balaji matrices
    full_R = Clustering.get_R_balaji(full_D)
    partial_R = Clustering.get_R_balaji(partial_D)
    # Get the fiedler eigenvector and another eigenvector
    # for the full and the partial balaji matrices.
    full_va, full_vb = get_eigenvectors(full_R)
    partial_va, partial_vb = get_eigenvectors(partial_R)
    # create the response
    out = StringIO()
    print >> out, 'Fiedler vector associated with the graph'
    print >> out, 'for which the internal nodes are hidden:'
    print >> out, str(tuple(partial_va))
    print >> out
    print >> out, 'The tip subvector of the Fiedler vector'
    print >> out, 'associated with the graph of the full tree:'
    print >> out, str(tuple(full_va[:len(ordered_tip_ids)]))
    # write the response
    return out.getvalue()
Example No. 6
def gap(X, X_pred, data, met, pjname, numS, ks, bound=(0.1,0.1)):
  # Generate a reference distribution over the bounding box of the original dataset
  nrefs = 10
  shape = data.shape
  refs = None
  if refs is None:
    tops = data.max(axis=0)
    bots = data.min(axis=0)
    dists = scipy.matrix(scipy.diag(tops-bots))
    rands = scipy.random.random_sample(size=(shape[0],shape[1],nrefs))
    for i in range(nrefs):
      rands[:,:,i] = rands[:,:,i]*dists+bots
  else:
    rands = refs
 
  # Calculate gap for each k
  res = []
  gaps = scipy.zeros((len(ks),))
  stds = np.zeros((len(ks),))
  gaps_d = scipy.zeros((len(ks),))
  stds_d = np.zeros((len(ks),))
  for (i,k) in enumerate(ks):
    print 'Start analysis on k=', k
    kmc, kml = Clustering.clustering(k, X, data, met)
    # Added procedure : calculate GP mean and var rather than cluster centers
    # Calculate distance by likelihood of Gaussian rather than Euclidean
    disp_d = sum([dst(data[m,:],kmc[kml[m],:]) for m in range(shape[0])])
    subname = 'K'+'{:02d}'.format(k)
    np.savetxt(pjname+'/'+subname+'_labels.txt', kml, fmt='%s', delimiter='\n')
    kmc, kmstd, kmc_long, kmstd_long, p = calCent(X, X_pred, data, pjname, subname, kml, numS, k, bound)
    res.append((kmc, kmstd, kmc_long, kmstd_long, p, kml))
    disp = sum([calDist_long(data[m,:], X, X_pred, kmc_long[kml[m],:], kmstd_long[kml[m],:], numS) for m in range(shape[0])])
    # Calculate reference gap
    refdisps = scipy.zeros((rands.shape[2],))
    refdisps_d = scipy.zeros((rands.shape[2],))
    for j in range(rands.shape[2]):
      kmc, kml = Clustering.clustering(k, X, rands[:,:,j], met)
      refdisps_d[j] = sum([dst(rands[m,:,j],kmc[kml[m],:]) for m in range(shape[0])])
      subname = 'R'+'{:02d}'.format(k)
      kmc, kmstd, kmc_long, kmstd_long, p = calCent(X, X_pred, rands[:,:,j], pjname, subname, kml, numS, k, bound)
      refdisps[j] = sum([calDist_long(rands[m,:,j], X, X_pred, kmc_long[kml[m],:], kmstd_long[kml[m],:], numS) for m in range(shape[0])])
    gaps[i] = scipy.log(disp)-scipy.mean(scipy.log(refdisps))
    stds[i] = np.std(scipy.log(refdisps))*np.sqrt(1+1/float(nrefs))
    gaps_d[i] = scipy.mean(scipy.log(refdisps_d))-scipy.log(disp_d)
    stds_d[i] = np.std(scipy.log(refdisps_d))*np.sqrt(1+1/float(nrefs))
    #print 'Gap(GP) for', k, 'is', scipy.log(disp), '-', scipy.mean(scipy.log(refdisps)), '=', gaps[i]
    #print 'Gap(ED) for', k, 'is', scipy.log(disp_d), '-', scipy.mean(scipy.log(refdisps_d)), '=', gaps_d[i]

  # Find the optimal k by std of log(refdisps)
  opt_i = optK2(ks, gaps, stds)
  opt_id = optK2(ks, gaps_d, stds_d)
  # Visualize gap statistics
  plt.errorbar(ks, gaps, yerr=stds)
  plt.errorbar(ks, gaps_d, yerr=stds_d)
  plt.savefig(pjname+'/GapStatistics.png')
  plt.close()
  pickle.dump((gaps, stds, gaps_d, stds_d), open(pjname+'/gaps.dump', 'w'))
  c, std, c_long, std_long, p, labels = res[opt_i]

  return c, std, c_long, std_long, p, ks[opt_i], labels
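
The loop above implements the gap statistic with project-specific helpers (Clustering.clustering, calCent, calDist_long). For orientation, a minimal sketch of the textbook gap statistic (Tibshirani et al. 2001) using scikit-learn's KMeans instead; the uniform reference sampling mirrors the rands block above.

import numpy as np
from sklearn.cluster import KMeans

def gap_statistic_sketch(data, ks, nrefs=10, random_state=0):
    """Gap statistic over candidate cluster counts ks (toy version)."""
    rng = np.random.default_rng(random_state)
    bots, tops = data.min(axis=0), data.max(axis=0)
    gaps = []
    for k in ks:
        # within-cluster dispersion of the real data
        disp = KMeans(n_clusters=k, n_init=10, random_state=random_state).fit(data).inertia_
        # dispersions of uniform reference samples drawn over the same bounding box
        ref_disps = [
            KMeans(n_clusters=k, n_init=10, random_state=random_state)
            .fit(rng.uniform(bots, tops, size=data.shape)).inertia_
            for _ in range(nrefs)
        ]
        gaps.append(np.mean(np.log(ref_disps)) - np.log(disp))
    return np.array(gaps)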
Example No. 7
def process_tree(tree, tree_name, show_newick, show_art):
    """
    @param tree: a FelTree to be split by each method
    @param tree_name: a description of the tree
    @param show_newick: an output option
    @param show_art: an output option
    @return: a multi-line output string
    """
    out = StringIO()
    # be verbose if requested
    if show_newick:
        print >> out, 'newick representation of %s:' % tree_name
        print >> out, Newick.get_narrow_newick_string(tree, 80) 
    if show_art:
        print >> out, 'ascii art representation of %s:' % tree_name
        print >> out, get_art(tree)
    # cut the tree using each method
    ordered_names = list(sorted(node.get_name() for node in tree.gen_tips()))
    n = len(ordered_names)
    D = tree.get_distance_matrix(ordered_names)
    splitters = (Clustering.StoneExactDMS(), Clustering.StoneSpectralSignDMS())
    splitter_names = ('the +1 / -1 split criterion', 'the fiedler criterion')
    for splitter, splitter_name in zip(splitters, splitter_names):
        small_index_selection = splitter.get_selection(D)
        big_index_selection = set(range(n)) - small_index_selection
        names_a = list(sorted(ordered_names[i] for i in small_index_selection))
        names_b = list(sorted(ordered_names[i] for i in big_index_selection))
        print >> out, 'split inferred by %s:' % splitter_name
        print >> out, '{{%s}, {%s}}' % (', '.join(names_a), ', '.join(names_b))
    # return the string
    return out.getvalue()
Example No. 8
def run_clustering_city(filepath, filename, k, eps, latitude, longitude):
    """
    The function clusters data for a given city and draws the result obtained on the map.
    :param filepath: path of file .csv
    :param filename: name of file .csv
    :param k: the k parameter of the k-MXT algorithm
    :param eps: the eps (neighbourhood radius) parameter
    :param latitude: latitude of city
    :param longitude: longitude of city
    :return: None
    """
    d = Cluster.ClusterGreatCircles(filepath, filename)
    for k in [7]:  # note: these loops override the k and eps arguments with fixed values
        for eps in [50]:
            c = Clustering.K_MXTGreatCircle(eps, k, d)
            c()
            m = Metrics.Modularity(c)
            print(f'k-MXT k={k} eps={eps} Modularity={m()}')
            c.cluster.view_at_map(latitude=latitude,
                                  longitude=longitude,
                                  filename_of_map=f'{k}-MXT-eps{eps}')
            c = Clustering.K_MXTGaussGreatCircle(eps, k, d)
            c()
            c.cluster.view_at_map(latitude=latitude,
                                  longitude=longitude,
                                  filename_of_map=f'{k}-MXTGauss-eps{eps}')
            m = Metrics.Modularity(c)
            print(f'k-MXT-Gauss k = {k} eps = {eps} Modularity = {m()}')
Example No. 9
def getClusteringEvalPlots(dataset):
    noOfClusters = range(2, 11, 1)

    for ds in dataset:
        sse = [[]]
        sil = [[[], []]]
        scores = [[[], []], [[], []], [[], []], [[], []], [[], []]]
        for cluster in noOfClusters:
            kmLearner = Clustering.KM(n_clusters=cluster)
            kmLearner.getLearner().fit(ds.training_x)
            emLearner = Clustering.EM(n_components=cluster)
            emLearner.getLearner().fit(ds.training_x)
            clustringY_KM = kmLearner.getLearner().predict(ds.training_x)
            clustringY_EM = emLearner.getLearner().predict(ds.training_x)
            homogeneityKM, completenessKM, v_measureKM = homogeneity_completeness_v_measure(ds.training_y, clustringY_KM)
            AMISKM = adjusted_mutual_info_score(ds.training_y, clustringY_KM)
            ARSKM = adjusted_rand_score(ds.training_y, clustringY_KM)
            silhouetteKM = silhouette_score(ds.training_x, clustringY_KM)
            homogeneityEM, completenessEM, v_measureEM = homogeneity_completeness_v_measure(ds.training_y, clustringY_EM)
            AMISEM = adjusted_mutual_info_score(ds.training_y, clustringY_EM)
            ARSEM = adjusted_rand_score(ds.training_y, clustringY_EM)
            silhouetteEM = silhouette_score(ds.training_x, clustringY_EM)

            sse.append(kmLearner.getLearner().inertia_)
            sil[0][0].append(silhouetteKM)
            scores[0][0].append(v_measureKM)
            scores[1][0].append(AMISKM)
            scores[2][0].append(ARSKM)
            scores[3][0].append(homogeneityKM)

            sil[0][1].append(silhouetteEM)
            scores[0][1].append(v_measureEM)
            scores[1][1].append(AMISEM)
            scores[2][1].append(ARSEM)
            scores[3][1].append(homogeneityEM)

        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, sil[0][0], label='Silhouette Score, KM', marker='o')
        plt.plot(noOfClusters, sil[0][1], label='Silhouette Score, EM', marker='o', linestyle='--')
        plt.ylabel('Silhouette Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Silhouette Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Silhouette for ' + ds.name + '.png')
        plt.close()

        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, scores[0][0], label='V Measure, KM', marker='o')
        plt.plot(noOfClusters, scores[1][0], label='Adj. Mutual Info, KM', marker='o')
        plt.plot(noOfClusters, scores[2][0], label='Adj. Rand. Score, KM', marker='o')
        plt.plot(noOfClusters, scores[0][1], label='V Measure, EM', marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[1][1], label='Adj. Mutual Info, EM', marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[2][1], label='Adj. Rand. Score, EM', marker='o', linestyle='--')
        plt.ylabel('Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Score Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Score for ' + ds.name + '.png')
        plt.close()
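
The KM and EM wrappers above are project-specific; a self-contained sketch of the same scikit-learn evaluation metrics on toy data, with plain KMeans standing in for the wrappers:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                             homogeneity_completeness_v_measure,
                             silhouette_score)

X, y_true = make_blobs(n_samples=300, centers=3, random_state=0)
y_pred = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

print('silhouette:', silhouette_score(X, y_pred))
print('ARI:', adjusted_rand_score(y_true, y_pred))
print('AMI:', adjusted_mutual_info_score(y_true, y_pred))
print('homogeneity/completeness/v-measure:',
      homogeneity_completeness_v_measure(y_true, y_pred))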
Example No. 10
def main():

    prefix_500 = "500"
    prefix_1000 = "1000"
    prefix_2000 = "2000"

    suffix_start_count = 1
    suffix_end_count = 10

    design_kmer_list = kmers.get_design_kmers()

    use_cluster_size_hard_stop = False

    for suffix_count in range(suffix_start_count, suffix_end_count + 1):

        input_file_name = prefix_500 + '_' + str(suffix_count)
        sequence_kmer_list = kmers.get_sequence_kmers(input_file_name)
        cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list,
                                                   design_kmer_list,
                                                   use_cluster_size_hard_stop)
        pickle.dump(
            cluster_dict,
            open("clusters_no_hard_stop" + input_file_name + ".p", "wb"))
        consensus_kmer_list = cluster_dict.keys()
        consensus_mapping_list = get_bipartite_matching(
            consensus_kmer_list, design_kmer_list)
        output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list,
                                     cluster_dict)

    for suffix_count in range(suffix_start_count, suffix_end_count + 1):
        input_file_name = prefix_1000 + '_' + str(suffix_count)
        sequence_kmer_list = kmers.get_sequence_kmers(input_file_name)
        cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list,
                                                   design_kmer_list,
                                                   use_cluster_size_hard_stop)
        pickle.dump(
            cluster_dict,
            open("clusters_no_hard_stop" + input_file_name + ".p", "wb"))
        consensus_kmer_list = cluster_dict.keys()
        consensus_mapping_list = get_bipartite_matching(
            consensus_kmer_list, design_kmer_list)
        output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list,
                                     cluster_dict)

    for suffix_count in range(suffix_start_count, suffix_end_count + 1):
        input_file_name = prefix_2000 + '_' + str(suffix_count)
        sequence_kmer_list = kmers.get_sequence_kmers(input_file_name)
        cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list,
                                                   design_kmer_list,
                                                   use_cluster_size_hard_stop)
        pickle.dump(
            cluster_dict,
            open("clusters_no_hard_stop" + input_file_name + ".p", "wb"))
        consensus_kmer_list = cluster_dict.keys()
        consensus_mapping_list = get_bipartite_matching(
            consensus_kmer_list, design_kmer_list)
        output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list,
                                     cluster_dict)
Example No. 11
def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # read the criterion string, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    # read the original tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # define the maximum number of steps we want
    max_steps = 1000000
    # Make sure that the splitter object is appropriate
    # for the number of taxa and the number of tree reconstructions.
    ntaxa = len(list(tree.gen_tips()))
    if splitter.get_complexity(ntaxa) * fs.iterations > max_steps:
        msg_a = 'use a faster bipartition function, '
        msg_b = 'fewer taxa, or fewer tree reconstructions'
        raise HandlingError(msg_a + msg_b)
    # define the simulation parameters
    sim = Simulation(splitter, 'nj', 'cgi tree building simulation')
    sim.set_original_tree(tree)
    sim.set_step_limit(max_steps)
    # define an arbitrary but consistent ordering of the taxa
    ordered_names = [node.name for node in tree.gen_tips()]
    # attempt to simulate a bunch of distance matrices
    sampler = DMSampler.DMSampler(tree, ordered_names, fs.length)
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if a proposal was accepted then add it to the list
        if result:
            sequence_list, distance_matrix = result
            distance_matrices.append(distance_matrix)
        # if enough accepted samples have been generated then stop sampling
        remaining_acceptances = fs.iterations - len(distance_matrices)
        if not remaining_acceptances:
            break
        # If the remaining number of computrons is predicted
        # to be too much then stop.
        if sampler.get_remaining_computrons(remaining_acceptances) > max_steps:
            msg_a = 'this combination of parameters '
            msg_b = 'is predicted to take too long'
            raise HandlingError(msg_a + msg_b)
    sim.run(distance_matrices, ordered_names)
    # define the response
    out = StringIO()
    print >> out, 'partition error count frequencies:'
    print >> out, sim.get_histogram_string()
    print >> out, ''
    print >> out, 'weighted partition errors:', sim.get_deep_loss()
    # return the response
    return out.getvalue()
Example No. 12
def create_topic_clusters_and_map(restaurants,
                                  restaurant_ids_to_topics,
                                  my_map,
                                  lda,
                                  use_human_labels=True):
    data = Clustering.create_data_array(restaurants, restaurant_ids_to_topics,
                                        my_map)
    Clustering.plot_clusters(my_map, restaurants, restaurant_ids_to_topics,
                             data, lda)
Example No. 13
def get_response_content(fs):
    # read the matrix
    D = fs.matrix
    if len(D) < 3:
        raise HandlingError('the matrix should have at least three rows')
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if len(ordered_labels) != len(D):
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of rows in the matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_labels)) != len(ordered_labels):
        raise HandlingError('the ordered labels must be unique')
    # read the criterion string, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    # Make sure that the splitter object
    # is appropriate for the size of the distance matrix.
    if splitter.get_complexity(len(D)) > 1000000:
        msg = 'use a smaller distance matrix or a faster bipartition function'
        raise HandlingError(msg)
    # read the original tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    if len(ordered_labels) != len(list(tree.gen_tips())):
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of tips in the tree'
        raise HandlingError(msg_a + msg_b)
    tree_tip_names = set(tip.name for tip in tree.gen_tips())
    if tree_tip_names != set(ordered_labels):
        msg_a = 'the leaf labels of the tree do not match '
        msg_b = 'the ordered labels of the distance matrix rows'
        raise HandlingError(msg_a + msg_b)
    # create the tree builder
    tree_builder = NeighborhoodJoining.ValidatingTreeBuilder(
        D.tolist(), ordered_labels, splitter)
    # Read the recourse string and set the corresponding method
    # in the tree builder.
    if fs.njrecourse:
        tree_builder.set_fallback_name('nj')
    elif fs.halvingrecourse:
        tree_builder.set_fallback_name('halving')
    # define the response
    out = StringIO()
    # set parameters of the tree validating tree builder
    tree_builder.set_original_tree(tree)
    tree_builder.set_output_stream(out)
    tree = tree_builder.build()
    # return the response
    return out.getvalue()
Example No. 14
def do_hard_coded_analysis_a(tree, tree_remark):
    """
    Do a hardcoded analysis of tree reconstruction methods.
    Make a bunch of R files.
    @param tree: a tree object
    @param tree_remark: a string that is a comment about the tree
    """
    # define an arbitrary order for the names of the leaves of the tree
    ordered_names = list(node.name for node in tree.gen_tips())
    # use 1000 replicates
    reconstruction_count = 1000
    # Make R files for reconstruction results
    # from sequences 100 and 500 nucleotides long.
    for sequence_length in (100, 500):
        # sample distance matrices
        print 'sampling', reconstruction_count, 'distance matrices'
        print 'from alignments of length', sequence_length
        sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length)
        distance_matrices = []
        for result in sampler.gen_samples_or_none():
            # if the proposal was rejected then try again
            if not result:
                continue
            # add the accepted distance matrix sample to the list
            sequence_list, distance_matrix = result
            distance_matrices.append(distance_matrix)
            # stop when we have generated enough distance matrices
            if len(distance_matrices) == reconstruction_count:
                break
        # run both neighbor joining and spectral sign clustering
        sims = [
            Simulation(Clustering.NeighborJoiningDMS(), 'nj',
                       'neighbor joining'),
            Simulation(Clustering.StoneSpectralSignDMS(), 'nj',
                       'spectral sign')
        ]
        for sim in sims:
            print 'reconstructing', len(distance_matrices), 'trees'
            print 'using', sim.description
            sim.set_original_tree(tree)
            sim.run(distance_matrices, ordered_names)
        # consider the neighbor joining and the spectral sign results
        nj_sim, ss_sim = sims
        # write the uniform loss function comparison R script
        script_contents = R_helper(nj_sim.get_normalized_error_counts(),
                                   ss_sim.get_normalized_error_counts())
        filename = 'uniform_%d.R' % sequence_length
        with open(filename, 'w') as fout:
            print >> fout, script_contents
        # write the weighted loss function comparison R script
        script_contents = R_helper(nj_sim.get_normalized_loss_values(),
                                   ss_sim.get_normalized_loss_values())
        filename = 'weighted_%d.R' % sequence_length
        with open(filename, 'w') as fout:
            print >> fout, script_contents
Example No. 15
def plotClusterWordCloudArray(articles, articleCentroidIds, Ks):
    fig, axes = plt.subplots(Ks.shape[0], Ks.shape[1], figsize=(12, 12))
    for i in range(Ks.shape[0]):
        for j in range(Ks.shape[1]):
            axes[i, j].imshow(
                createWordCloud(
                    Clustering.concatinateClusterTexts(articles,
                                                       articleCentroidIds,
                                                       Ks[i, j])))
            axes[i, j].axis("off")
            axes[i, j].set_title("Cluster " + str(Ks[i, j]) + "; count=" + str(
                Clustering.countClusterArticles(articles, articleCentroidIds,
                                                Ks[i, j])))
Example No. 16
def create_clustering(documents, config):
    """Returns a clustering of the documents."""
    cluster_alg = config.get('clustering', 'cluster_algorithm')

    #start dbscan and infer the options
    if cluster_alg == 'dbscan' and not config.has_option('clustering','cluster_options'):
        clusters = Clustering.create_clusters_dbscan_infer_options(documents)
    #use given options
    else:
        cluster_options = json.loads(config.get('clustering','cluster_options'))
        clusters = Clustering.create_clusters(documents, cluster_alg,
                cluster_options)
    return clusters
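
For context, a minimal sketch of a config that this function could consume, using Python 3's configparser; the algorithm name and JSON options below are purely illustrative assumptions.

import configparser
import json

config = configparser.ConfigParser()
config.read_string("""
[clustering]
cluster_algorithm = kmeans
cluster_options = {"n_clusters": 8}
""")

alg = config.get('clustering', 'cluster_algorithm')
opts = json.loads(config.get('clustering', 'cluster_options'))
print(alg, opts)  # kmeans {'n_clusters': 8}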
Example No. 17
 def initNodeLearningParams(self, AlgorithmChoice, AlgParams):
     self.AlgorithmChoice = AlgorithmChoice
     if AlgorithmChoice == 'Clustering':
         CentsPerLayer = AlgParams['NumCentsPerLayer']
         # InputWidth = InputWidths[LayerNum]
         if self.LayerNumber == 0:
             InputWidth = 48
         else:
             InputWidth = CentsPerLayer[self.LayerNumber - 1] * 4
         self.LearningAlgorithm = Clustering(
             AlgParams['mr'], AlgParams['vr'], AlgParams['sr'], InputWidth,
             AlgParams['NumCentsPerLayer'][self.LayerNumber],
             self.NodePosition)
     else:
         print('Only Incremental Clustering Exists')
Example No. 18
def main():
    xlxs_to_maps = XlxsToMapsWrapper()
    xlxs_to_maps.getLatLngData()

    # Compute the cost of all Schools from all centers for clustering
    get_and_store_cost = GetAndStoreCost()
    get_and_store_cost.compute_cost_based_on_saved_text_file()
    # get_and_store_cost.compute_costs()
    # get_and_store_cost.compute_costs_thread_pool()

    # Identify the high level clusters
    clustering = Clustering()
    clustering.high_level_clustering()

    return
Example No. 19
def main():
    xlxs_to_maps = XlxsToMapsWrapper()
    xlxs_to_maps.getLatLngData()

    # Compute the cost of all Schools from all centers for clustering
    get_and_store_cost = GetAndStoreCost()
    get_and_store_cost.compute_cost_based_on_saved_text_file()
    # get_and_store_cost.compute_costs()
    # get_and_store_cost.compute_costs_thread_pool()

    # Identify the high level clusters
    clustering = Clustering()
    clustering.high_level_clustering()

    return
Example No. 20
def create_clustering(documents, config):
    """Returns a clustering of the documents."""
    cluster_alg = config.get('clustering', 'cluster_algorithm')

    #start dbscan and infer the options
    if cluster_alg == 'dbscan' and not config.has_option(
            'clustering', 'cluster_options'):
        clusters = Clustering.create_clusters_dbscan_infer_options(documents)
    #use given options
    else:
        cluster_options = json.loads(
            config.get('clustering', 'cluster_options'))
        clusters = Clustering.create_clusters(documents, cluster_alg,
                                              cluster_options)
    return clusters
Example No. 21
def gold_ratio(browser):
    cee_result = Cee.color_element_from_url(browser)
    cl_list = []
    color_size_dict = {}
    for ce in cee_result:
        text_size_dict = ce[1].size
        cl_list.append(color_to_tuple(ce[2]))
        text_area = text_size_dict['height'] * text_size_dict['width']
        color_size_dict[color_to_tuple(ce[2])] = text_area
        back_size_dict = ce[3].size
        cl_list.append(color_to_tuple(ce[4]))
        back_area = back_size_dict['height'] * back_size_dict['width']
        color_size_dict[color_to_tuple(ce[4])] = back_area
    clust_result = Clust.Clustering(cl_list, 3)
    color_ratio = []
    for cr in clust_result:
        size_sum = 0
        for t in cr:
            size_sum += color_size_dict[t]
        color_ratio.append(size_sum)
    color_ratio.sort(reverse=True)
    # print(color_ratio)
    b_ratio = color_ratio[0] / color_ratio[2]
    s_ratio = color_ratio[1] / color_ratio[2]
    golden = (b_ratio - 6) / 9 + (s_ratio - 3) / 9
    return 1 - abs(golden)
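
The final score seems to encode the 60-30-10 colour-area rule (background : secondary : accent ≈ 6 : 3 : 1); under that reading, areas in exactly that ratio give golden = 0 and a perfect score of 1. A quick arithmetic check with hypothetical areas:

# Hypothetical areas following the 60-30-10 rule exactly.
color_ratio = [600, 300, 100]                # largest, middle, smallest area
b_ratio = color_ratio[0] / color_ratio[2]    # 6.0
s_ratio = color_ratio[1] / color_ratio[2]    # 3.0
golden = (b_ratio - 6) / 9 + (s_ratio - 3) / 9
print(1 - abs(golden))                       # 1.0 (perfect score)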
Example No. 22
def get_response_content(fs):
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get the selected names
    selection = Util.get_stripped_lines(fs.selection.splitlines())
    selected_name_set = set(selection)
    possible_name_set = set(node.get_name() for node in tree.gen_tips())
    extra_names = selected_name_set - possible_name_set
    if extra_names:
        msg_a = 'the following selected names '
        msg_b = 'are not valid tips: %s' % str(tuple(extra_names))
        raise HandlingError(msg_a + msg_b)
    complement_name_set = possible_name_set - selected_name_set
    # assert that neither the selected name set nor its complement is empty
    if not selected_name_set or not complement_name_set:
        raise HandlingError('the selection is degenerate')
    # define an ordering on the tips
    ordered_names = [node.get_name() for node in tree.gen_tips()]
    # convert the selected names to a Y vector
    Y_as_list = []
    for name in ordered_names:
        if name in selected_name_set:
            value = 1
        else:
            value = -1
        Y_as_list.append(value)
    Y = np.array(Y_as_list)
    # get the distance matrix
    D = tree.get_distance_matrix(ordered_names)
    # get the R matrix
    R = Clustering.get_R_balaji(D)
    value = np.dot(np.dot(Y, R), Y.T)
    # return the taxon split evaluation
    return str(value) + '\n'
Example No. 23
    def data_preprosessing(self, method='MeanShift'):
        new_column = []

        if method == 'Uniform Distribution':
            for column in range(len(self.header_sublist)):
                self.new_data_frame[self.header_sublist[
                    column]] = self.data_preprosessing_uniform_distribution(
                        self.header_sublist[column], 8)
        elif method == 'Equal Steps':
            self.data_preprosessing_equal_range()
        else:  # 'MeanShift'
            for column in self.header_sublist:
                labled_column, cluster_centers = Clustering.meanShift(
                    self.data_frame[column].values)
                #####################################################################################
                #   we need cluster_centers for future prediction and clustering of a new song     #
                #####################################################################################
                self.new_data_frame[column] = labled_column

        #####################################
        for i in range(len(self.data_frame['song_popularity'])):
            if self.data_frame['song_popularity'][i] < 20:
                new_column.append(0)
            elif self.data_frame['song_popularity'][i] < 40:
                new_column.append(1)
            elif self.data_frame['song_popularity'][i] < 60:
                new_column.append(2)
            elif self.data_frame['song_popularity'][i] < 80:
                new_column.append(3)
            else:
                new_column.append(4)

        self.new_data_frame['song_popularity'] = new_column

        self.new_data_frame.to_csv(self.output_file_name)
Example No. 24
def do_first_method(subtree_a, subtree_b,
        taxa_a1, taxa_a2, taxa_b1, taxa_b2, connecting_branch_length, out):
    # define the branch lengths of the reduced tree
    blen_a1, blen_a2, blen_ar = get_branch_length_equivalents(
            subtree_a, taxa_a1, taxa_a2)
    blen_b1, blen_b2, blen_br = get_branch_length_equivalents(
            subtree_b, taxa_b1, taxa_b2)
    # define the distance matrix of the reduced tree
    a, b = blen_a1, blen_a2
    c, d = blen_b1, blen_b2
    e = connecting_branch_length + blen_ar + blen_br
    reduced_D = [
            [0, a+b, a+e+c, a+e+d],
            [b+a, 0, b+e+c, b+e+d],
            [c+e+a, c+e+b, 0, c+d],
            [d+e+a, d+e+b, d+c, 0]]
    # get the R matrix of the reduced tree
    reduced_R = Clustering.get_R_balaji(reduced_D)
    print >> out, 'first method:'
    print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a, b)
    print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c, d)
    print >> out, 'equivalent connecting branch length:', e
    print >> out, 'M for the equivalent tree:'
    print >> out, MatrixUtil.m_to_string(reduced_R)
    print >> out
Example No. 25
def trainData(data):
    path = os.getcwd()
    class1 = Clustering.cluster(path)
    data_seg = class1.preTreating(inptdata=data)
    for i in range(0, len(data_seg)):
        data_seg[i] = data_seg[i].split(" ")
    return data_seg
Example No. 26
def get_response_content(fs):
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get the selected names
    selection = Util.get_stripped_lines(fs.selection.splitlines())
    selected_name_set = set(selection)
    possible_name_set = set(node.get_name() for node in tree.gen_tips())
    extra_names = selected_name_set - possible_name_set
    if extra_names:
        msg_a = 'the following selected names '
        msg_b = 'are not valid tips: %s' % str(tuple(extra_names))
        raise HandlingError(msg_a + msg_b)
    complement_name_set = possible_name_set - selected_name_set
    # assert that neither the selected name set nor its complement is empty
    if not selected_name_set or not complement_name_set:
        raise HandlingError('the selection is degenerate')
    # define an ordering on the tips
    ordered_names = [node.get_name() for node in tree.gen_tips()]
    # convert the selected names to a Y vector
    Y_as_list = []
    for name in ordered_names:
        if name in selected_name_set:
            value = 1
        else:
            value = -1
        Y_as_list.append(value)
    Y = np.array(Y_as_list)
    # get the distance matrix
    D = tree.get_distance_matrix(ordered_names)
    # get the R matrix
    R = Clustering.get_R_balaji(D)
    value = np.dot(np.dot(Y, R), Y.T)
    # return the taxon split evaluation
    return str(value) + '\n'
Example No. 27
def main():
    parser = OptionParser()
    
    parser.add_option("-t", "--test", dest="test", default=False, action="store_true",
                  help="Test the given set with the centroids of clusters")
    parser.add_option("-s", "--save", dest="saveCentroids", default=False, action="store_true",
                  help="Save the centroids of each cluster in a file")
    parser.add_option("-f", "--force", dest="forceMatrix", default=False, action="store_true",
                  help="Creates similarity matrix even if it exists")
    
    (options, args) = parser.parse_args()
    
    start_time = time.time()    
    files = os.listdir(SPN_PATH)

    if os.path.isfile(MATRIX_FILE):  # load the correlation matrix if it exists, otherwise generate it
        H = np.load(MATRIX_FILE)
    else:
        H = Create_Similarity_Matrix(files)
        
    print 'Working with matrix.... ' + MATRIX_FILE
    print 'Matrix creation time: ', time.time() - start_time
    
    Clusters = clr.HierarchicalClustering(H, SIMILARITY_MSR)
    
    if options.test:
        Centroids = GetClusterCentroids(Clusters,files,options.saveCentroids)
        SimpleClustering(Clusters,Centroids,files)
    
    Eval = EvaluateClusters(Clusters,files)
    
    SaveOutput(Clusters,Eval,files)
    print "Numero total de clusters/camaras = "+str(len(Clusters))
    print time.time() - start_time, "seconds"        
Example No. 28
def get_response_content(fs):
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that the given labels are tips of the tree
    tip_name_set = set(node.get_name() for node in tree.gen_tips())
    user_name_set = set([fs.lhs_a, fs.lhs_b, fs.rhs_a, fs.rhs_b])
    bad_names = user_name_set - tip_name_set
    if bad_names:
        msg = 'these labels are not valid tips: %s' % ', '.join(bad_names)
        raise HandlingError(msg)
    # get the submatrix of the distance matrix
    ordered_names = list(sorted(node.get_name() for node in tree.gen_tips()))
    D = np.array(tree.get_distance_matrix(ordered_names))
    # get the response matrix
    R = Clustering.get_R_stone(D)
    # get the two by two matrix
    name_to_index = dict((name, i) for i, name in enumerate(ordered_names))
    R_reduced = np.zeros((2, 2))
    la = name_to_index[fs.lhs_a]
    lb = name_to_index[fs.lhs_b]
    ra = name_to_index[fs.rhs_a]
    rb = name_to_index[fs.rhs_b]
    R_reduced[0][0] = R[la][ra]
    R_reduced[0][1] = R[la][rb]
    R_reduced[1][0] = R[lb][ra]
    R_reduced[1][1] = R[lb][rb]
    epsilon = 1e-13
    criterion = np.linalg.det(R_reduced)
    if abs(criterion) < epsilon:
        criterion = 0
    # in analogy to the four point condition, use two different ways of calculating the distance
    blen_a = (D[la][rb] + D[lb][ra] - D[la][lb] - D[ra][rb]) / 2.0
    blen_b = (D[la][ra] + D[lb][rb] - D[la][lb] - D[ra][rb]) / 2.0
    blen = min(blen_a, blen_b)
    # define the response
    out = StringIO()
    paragraphs = []
    if fs.show_response:
        paragraph = [
            'response matrix with rows ordered alphabetically by leaf label:',
            MatrixUtil.m_to_string(R)
        ]
        paragraphs.append(paragraph)
    if fs.show_reduced_response:
        paragraph = [
            '2x2 submatrix of the response matrix:',
            MatrixUtil.m_to_string(R_reduced)
        ]
        paragraphs.append(paragraph)
    if True:
        paragraph = [
            'determinant of the 2x2 submatrix of the response matrix:',
            str(criterion)
        ]
        paragraphs.append(paragraph)
    if fs.show_blen:
        paragraph = ['branch length defined by the split:', str(blen)]
        paragraphs.append(paragraph)
    # return the response
    return '\n\n'.join('\n'.join(p) for p in paragraphs) + '\n'
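
As a sanity check on the branch-length formula above, a small worked example on a hand-built additive quartet distance matrix (the branch lengths are illustrative); for an exact tree metric both estimates agree and recover the internal branch length.

import numpy as np

# Additive quartet: tips la, lb on one side, ra, rb on the other,
# pendant branches a, b, c, d and internal branch e (toy values).
a, b, c, d, e = 1.0, 2.0, 3.0, 4.0, 5.0
D = np.array([[0,         a + b,     a + e + c, a + e + d],
              [a + b,     0,         b + e + c, b + e + d],
              [a + e + c, b + e + c, 0,         c + d],
              [a + e + d, b + e + d, c + d,     0]])
la, lb, ra, rb = 0, 1, 2, 3

blen_a = (D[la][rb] + D[lb][ra] - D[la][lb] - D[ra][rb]) / 2.0
blen_b = (D[la][ra] + D[lb][rb] - D[la][lb] - D[ra][rb]) / 2.0
print(min(blen_a, blen_b))  # 5.0, recovering the internal branch length e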
Example No. 29
def textInput(text):
    #test_test = "CRM宽带资源的地址在哪里"
    path = os.getcwd()
    class1 = Clustering.cluster(path)
    sentence = class1.preTreating(inptdata=text)
    sentence = sentence[0].split(" ")
    return sentence
Example No. 30
    def performDBSCAN(self):
        if (len(self.smallGoldParticleCoordinates) < 1):
            print 'DBSCAN cannot be performed, not enough data points'
        else:
            epsilonInPixel = self.DBSCANEpsilonDoubleSpinBox.value(
            ) * nmPixelRatio

            # perform DBSCAN
            array, labels = Clustering.dbscanClustering(
                self.smallGoldParticleCoordinates, [], [], epsilonInPixel,
                self.DBSCANMinSampleDoubleSpinBox.value())

            # appending X and Y coordinates of the localization points to vectors to save them with the cluster IDs (labels)
            xVec = []
            yVec = []
            for i in range(0, len(array)):
                xVec.append(array[i][0])
                yVec.append(array[i][1])

            # appending vectors to a matrix
            dataMatrix = []
            dataMatrix.append(xVec)
            dataMatrix.append(yVec)
            dataMatrix.append(labels)

            # transpose the matrix to have the data columnwise and save it with a '_DBSCAN' suffix
            dataMatrix = np.transpose(np.asarray(dataMatrix))
            filenameToSave = str(
                openedFilename[0:-4]) + '_DBSCAN_epsilon=' + str(
                    int(self.DBSCANEpsilonDoubleSpinBox.value())
                ) + 'nm_minSample=' + str(
                    int(self.DBSCANMinSampleDoubleSpinBox.value())) + '.txt'
            np.savetxt(filenameToSave, dataMatrix, fmt='%0.0f')
Example No. 31
 def target_plot(self, label: str, anomalies=False):
     labels = self.target[label].values
     n_labels = max(pd.unique(labels).max(), len(pd.unique(labels))) + 1
     return Clustering(labels=labels,
                       n_clusters=n_labels,
                       title=label.capitalize() + ' True Labels',
                       palette=self.palette)
Example No. 32
def get_response_content(fs):
    # read the matrix
    D = fs.matrix
    if len(D) < 3:
        raise HandlingError('the matrix should have at least three rows')
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(fs.labels.splitlines())
    if len(ordered_labels) != len(D):
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of rows in the matrix'
        raise HandlingError(msg_a + msg_b)
    # create the tree building object
    splitter = Clustering.StoneExactDMS()
    tree_builder = NeighborhoodJoining.TreeBuilder(D.tolist(), ordered_labels,
                                                   splitter)
    # Read the recourse string and set the corresponding method
    # in the tree builder.
    recourse_string = fs.getfirst('recourse')
    if fs.njrecourse:
        tree_builder.set_fallback_name('nj')
    elif fs.halvingrecourse:
        tree_builder.set_fallback_name('halving')
    # assert that the computation will not take too long
    if tree_builder.get_complexity() > 1000000:
        raise HandlingError('this computation would take too long')
    # build the tree
    tree = tree_builder.build()
    # return the response
    return NewickIO.get_newick_string(tree) + '\n'
Example No. 33
def get_response_content(fs):
    # get the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # assert that the given labels are tips of the tree
    tip_name_set = set(node.get_name() for node in tree.gen_tips())
    user_name_set = set([fs.lhs_a, fs.lhs_b, fs.rhs_a, fs.rhs_b])
    bad_names = user_name_set - tip_name_set
    if bad_names:
        msg = 'these labels are not valid tips: %s' % ', '.join(bad_names)
        raise HandlingError(msg)
    # get the submatrix of the distance matrix
    ordered_names = list(sorted(node.get_name() for node in tree.gen_tips()))
    D = np.array(tree.get_distance_matrix(ordered_names))
    # get the response matrix
    R = Clustering.get_R_stone(D)
    # get the two by two matrix
    name_to_index = dict((name, i) for i, name in enumerate(ordered_names))
    R_reduced = np.zeros((2,2))
    la = name_to_index[fs.lhs_a]
    lb = name_to_index[fs.lhs_b]
    ra = name_to_index[fs.rhs_a]
    rb = name_to_index[fs.rhs_b]
    R_reduced[0][0] = R[la][ra]
    R_reduced[0][1] = R[la][rb]
    R_reduced[1][0] = R[lb][ra]
    R_reduced[1][1] = R[lb][rb]
    epsilon = 1e-13
    criterion = np.linalg.det(R_reduced)
    if abs(criterion) < epsilon:
        criterion = 0
    # in analogy to the four point condition, use two different ways of calculating the distance
    blen_a = (D[la][rb] + D[lb][ra] - D[la][lb] - D[ra][rb]) / 2.0
    blen_b = (D[la][ra] + D[lb][rb] - D[la][lb] - D[ra][rb]) / 2.0
    blen = min(blen_a, blen_b)
    # define the response
    out = StringIO()
    paragraphs = []
    if fs.show_response:
        paragraph = [
                'response matrix with rows ordered alphabetically by leaf label:',
                MatrixUtil.m_to_string(R)]
        paragraphs.append(paragraph)
    if fs.show_reduced_response:
        paragraph = [
                '2x2 submatrix of the response matrix:',
                MatrixUtil.m_to_string(R_reduced)]
        paragraphs.append(paragraph)
    if True:
        paragraph = [
                'determinant of the 2x2 submatrix of the response matrix:',
                str(criterion)]
        paragraphs.append(paragraph)
    if fs.show_blen:
        paragraph = [
                'branch length defined by the split:',
                str(blen)]
        paragraphs.append(paragraph)
    # return the response
    return '\n\n'.join('\n'.join(p) for p in paragraphs) + '\n'
Example No. 34
 def __init__(self, data):
     path = os.getcwd()
     class1 = Clustering.cluster(path)
     result = class1.kmeans(data=data)
     self.clus_result = result[0]
     self.weight = result[1]
     self.tfidf_model = result[2:5]
     self.class1 = class1
Example No. 35
class Node:
    def __init__(self,
                 LayerNumber,
                 NodePos,
                 cifarstat={
                     'patch_mean': [],
                     'patch_std': [],
                     'whiten_mat': []
                 }):
        self.LayerNumber = LayerNumber
        self.NodePosition = NodePos
        self.Belief = []
        #cifarStat = load_cifar(4)# to be used for Normalization and Whitening Purposes
        self.patch_mean = cifarstat['patch_mean']
        self.patch_std = cifarstat['patch_std']
        self.v = cifarstat['whiten_mat']

    def initNodeLearningParams(self, AlgorithmChoice, AlgParams):
        self.AlgorithmChoice = AlgorithmChoice
        if AlgorithmChoice == 'Clustering':
            CentsPerLayer = AlgParams['NumCentsPerLayer']
            # InputWidth = InputWidths[LayerNum]
            if self.LayerNumber == 0:
                InputWidth = 48
            else:
                InputWidth = CentsPerLayer[self.LayerNumber - 1] * 4
            self.LearningAlgorithm = Clustering(
                AlgParams['mr'], AlgParams['vr'], AlgParams['sr'], InputWidth,
                AlgParams['NumCentsPerLayer'][self.LayerNumber],
                self.NodePosition)
        else:
            print('Only Incremental Clustering Exists')

    def loadInput(self, In):
        if self.LayerNumber == 0:
            In = In - self.patch_mean
            In = In / self.patch_std
            In = In.dot(self.v)
        self.Input = In

    def doNodeLearning(self, Mode):
        if self.AlgorithmChoice == 'Clustering':
            self.LearningAlgorithm.update_node(self.Input, Mode)
            self.Belief = self.LearningAlgorithm.belief
        else:
            print("Only Incremental Clustering Algorithm Exists")
Example No. 36
def main():

    prefix_500 = "500"
    prefix_1000 = "1000"
    prefix_2000 = "2000"

    suffix_start_count = 1
    suffix_end_count = 10

    design_kmer_list = kmers.get_design_kmers()

    use_cluster_size_hard_stop = False

    for suffix_count in range(suffix_start_count, suffix_end_count + 1):

        input_file_name = prefix_500 + '_' + str(suffix_count)
        sequence_kmer_list = kmers.get_sequence_kmers(input_file_name)
        cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list,
                                                   design_kmer_list,
                                                   use_cluster_size_hard_stop)
        pickle.dump(cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb"))
        consensus_kmer_list = cluster_dict.keys()
        consensus_mapping_list = get_bipartite_matching(consensus_kmer_list, design_kmer_list)
        output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict)

    for suffix_count in range(suffix_start_count, suffix_end_count + 1):
        input_file_name = prefix_1000 + '_' + str(suffix_count)
        sequence_kmer_list = kmers.get_sequence_kmers(input_file_name)
        cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list,
                                                   design_kmer_list,
                                                   use_cluster_size_hard_stop)
        pickle.dump(cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb"))
        consensus_kmer_list = cluster_dict.keys()
        consensus_mapping_list = get_bipartite_matching(consensus_kmer_list, design_kmer_list)
        output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict)

    for suffix_count in range(suffix_start_count, suffix_end_count + 1):
        input_file_name = prefix_2000 + '_' + str(suffix_count)
        sequence_kmer_list = kmers.get_sequence_kmers(input_file_name)
        cluster_dict = Clustering.get_cluster_dict(sequence_kmer_list,
                                                   design_kmer_list,
                                                   use_cluster_size_hard_stop)
        pickle.dump(cluster_dict, open("clusters_no_hard_stop" + input_file_name + ".p", "wb"))
        consensus_kmer_list = cluster_dict.keys()
        consensus_mapping_list = get_bipartite_matching(consensus_kmer_list, design_kmer_list)
        output_seq_kmer_mapping_list(input_file_name, consensus_mapping_list, cluster_dict)
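
The three loops above differ only in the filename prefix; a compacted sketch of the same pipeline, looping over the prefixes and assuming the same helpers (kmers, Clustering, get_bipartite_matching, output_seq_kmer_mapping_list) are in scope:

def main():
    design_kmer_list = kmers.get_design_kmers()
    use_cluster_size_hard_stop = False
    for prefix in ("500", "1000", "2000"):
        for suffix_count in range(1, 11):
            input_file_name = prefix + '_' + str(suffix_count)
            sequence_kmer_list = kmers.get_sequence_kmers(input_file_name)
            cluster_dict = Clustering.get_cluster_dict(
                sequence_kmer_list, design_kmer_list,
                use_cluster_size_hard_stop)
            # persist the clusters for this input file
            with open("clusters_no_hard_stop" + input_file_name + ".p", "wb") as f:
                pickle.dump(cluster_dict, f)
            consensus_kmer_list = cluster_dict.keys()
            consensus_mapping_list = get_bipartite_matching(
                consensus_kmer_list, design_kmer_list)
            output_seq_kmer_mapping_list(
                input_file_name, consensus_mapping_list, cluster_dict)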
Example No. 37
def transform_c(L, D_partial):
    """
    The matrices are ordered with leaves before internal nodes.
    @param L: full tree laplacian matrix
    @param D_partial: distances between leaves
    @return: the S matrix
    """
    M = Clustering.get_R_stone(D_partial)
    S = -2*M
    return S
Example No. 38
 def initNodeLearningParams(self, AlgorithmChoice, AlgParams):
     self.AlgorithmChoice = AlgorithmChoice
     if AlgorithmChoice == 'Clustering':
         CentsPerLayer = AlgParams['NumCentsPerLayer']
         # InputWidth = InputWidths[LayerNum]
         if self.LayerNumber == 0:
             InputWidth = 48
         else:
             InputWidth = CentsPerLayer[self.LayerNumber-1] * 4
         self.LearningAlgorithm = Clustering(AlgParams['mr'], AlgParams['vr'], AlgParams['sr'], InputWidth,
                                             AlgParams['NumCentsPerLayer'][self.LayerNumber], self.NodePosition)
     else:
         print('Only Incremental Clustering Exists')
Example No. 39
def getMaxSimilarityForNJ(d, seq):
    Q = {}
    n = len(seq)
    for (i, j) in d:
        sumI = 0
        sumJ = 0
        for k in seq:
            if i != k:
                sumI += d[i, k]
            if j != k:
                sumJ += d[j, k]
        Q[i, j] = (n - 2) * d[i, j] - sumI - sumJ
    maxSimilarity = Clustering.getMax(Q)
    return maxSimilarity
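
The loop above builds the standard neighbor-joining Q criterion, Q(i, j) = (n - 2) d(i, j) - sum_k d(i, k) - sum_k d(j, k). Clustering.getMax is not shown here; for a distance matrix the classical rule joins the pair with the smallest Q, as in this self-contained worked example on a classic five-taxon toy matrix:

# Classic five-taxon toy distance matrix (symmetric dict keyed by ordered pairs).
taxa = ['a', 'b', 'c', 'd', 'e']
upper = {('a', 'b'): 5, ('a', 'c'): 9, ('a', 'd'): 9, ('a', 'e'): 8,
         ('b', 'c'): 10, ('b', 'd'): 10, ('b', 'e'): 9,
         ('c', 'd'): 8, ('c', 'e'): 7, ('d', 'e'): 3}
d = {}
for (i, j), v in upper.items():
    d[i, j] = v
    d[j, i] = v

n = len(taxa)
Q = {(i, j): (n - 2) * d[i, j]
             - sum(d[i, k] for k in taxa if k != i)
             - sum(d[j, k] for k in taxa if k != j)
     for (i, j) in d}
print(min(Q, key=Q.get), min(Q.values()))  # ('a', 'b') -50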
Example No. 40
class Node:
    def __init__(self, LayerNumber, NodePos, cifarstat={'patch_mean':[],'patch_std':[],'whiten_mat':[]}):
        self.LayerNumber = LayerNumber
        self.NodePosition = NodePos
        self.Belief = []
        #cifarStat = load_cifar(4)# to be used for Normalization and Whitening Purposes
        self.patch_mean = cifarstat['patch_mean']
        self.patch_std = cifarstat['patch_std']
        self.v = cifarstat['whiten_mat']

    def initNodeLearningParams(self, AlgorithmChoice, AlgParams):
        self.AlgorithmChoice = AlgorithmChoice
        if AlgorithmChoice == 'Clustering':
            CentsPerLayer = AlgParams['NumCentsPerLayer']
            # InputWidth = InputWidths[LayerNum]
            if self.LayerNumber == 0:
                InputWidth = 48
            else:
                InputWidth = CentsPerLayer[self.LayerNumber-1] * 4
            self.LearningAlgorithm = Clustering(AlgParams['mr'], AlgParams['vr'], AlgParams['sr'], InputWidth,
                                                AlgParams['NumCentsPerLayer'][self.LayerNumber], self.NodePosition)
        else:
            print('Only Incremental Clustering Exists')

    def loadInput(self, In):
        if self.LayerNumber == 0:
            In = In - self.patch_mean
            In = In/self.patch_std
            In = In.dot(self.v)
        self.Input = In

    def doNodeLearning(self, Mode):
        if self.AlgorithmChoice == 'Clustering':
            self.LearningAlgorithm.update_node(self.Input, Mode)
            self.Belief = self.LearningAlgorithm.belief
        else:
            print("Only Incremental Clustering Algorithm Exists")
Example No. 41
def get_response_content(fs):
    # start writing the response type
    response_headers = []
    # get the processing options
    use_internal_nodes = fs.internal
    # read the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # get the ordered ids and ordered names of the nodes in the tree
    ordered_name_id_pairs = []
    for node in tree.preorder():
        # define the name of the node
        name = ''
        if node.is_tip():
            name = node.get_name()
        # possibly add the node
        if use_internal_nodes:
            ordered_name_id_pairs.append((name, id(node)))
        elif node.is_tip():
            ordered_name_id_pairs.append((name, id(node)))
    ordered_ids = [id_ for name, id_ in ordered_name_id_pairs]
    ordered_names = [name for name, id_ in ordered_name_id_pairs]
    #raise HandlingError('debug: ' + str(ordered_names))
    id_to_index = dict((id_, i) for i, id_ in enumerate(ordered_ids))
    # get the incidence matrix for drawing lines
    n = len(ordered_ids)
    incidence_matrix = [[0]*n for i in range(n)]
    if use_internal_nodes:
        for node in tree.preorder():
            for child in node.gen_children():
                parent_id = id_to_index[id(node)]
                child_id = id_to_index[id(child)]
                incidence_matrix[parent_id][child_id] = 1
                incidence_matrix[child_id][parent_id] = 1
    # get the R matrix from the tree; this is -1/2 times the laplacian matrix
    if use_internal_nodes:
        D = tree.get_full_distance_matrix(ordered_ids)
    else:
        D = tree.get_distance_matrix(ordered_names)
    R_matrix = Clustering.get_R_balaji(D)
    # draw the image
    try:
        ext = Form.g_imageformat_to_ext[fs.imageformat]
        image_size = (640, 480)
        return get_image(R_matrix, incidence_matrix, ordered_names,
                image_size, ext, fs.axes, fs.connections)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
Example No. 42
def get_response_content(fs):
    # read the weighted adjacency matrix
    A = fs.matrix
    # read the labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    # Assert that the number of labels
    # is compatible with the shape of the matrix.
    n = len(A)
    if len(ordered_labels) != n:
        msg = 'the number of labels does not match the size of the matrix'
        raise HandlingError(msg)
    # get the best objective function value and the corresponding best cluster
    if fs.conductance:
        max_size = 20
        if n > max_size:
            msg_a = 'for the min conductance objective function please '
            msg_b = 'limit the size of the matrix to %d rows' % max_size
            raise HandlingError(msg_a + msg_b)
        pairs = [(get_conductance(assignment, A), assignment)
                for assignment in Clustering.gen_assignments(n)]
        best_objective, best_assignment = min(pairs)
        best_cluster = set(i for i in range(n) if best_assignment[i] == 1)
    if fs.min:
        best_cluster = StoerWagner.stoer_wagner_min_cut(A)
        complement = set(range(n)) - best_cluster
        best_objective = sum(A[i][j] for i in best_cluster for j in complement)
    # get the smaller of the two clusters
    complement = set(range(n)) - best_cluster
    small_cluster = min((len(best_cluster), best_cluster),
            (len(complement), complement))[1]
    # start to prepare the response
    out = StringIO()
    print >> out, 'smallest cluster defined by the bipartition:'
    for index in sorted(small_cluster):
        print >> out, ordered_labels[index]
    print >> out, ''
    print >> out, 'objective function value:'
    print >> out, best_objective
    # write the response
    return out.getvalue()
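The helpers get_conductance and Clustering.gen_assignments used above are project code that is not shown in this listing. Purely as a sketch of what such helpers typically do (the names conductance_of_cut and gen_sign_assignments below are illustrative, and the formula is the usual cut weight divided by the smaller side's volume), they might look like:

import itertools

def gen_sign_assignments(n):
    # Yield every non-trivial +1/-1 labeling of n items; fixing item 0 to +1
    # avoids enumerating each bipartition twice.
    for rest in itertools.product((1, -1), repeat=n - 1):
        if -1 in rest:  # skip the trivial all-(+1) assignment
            yield (1,) + rest

def conductance_of_cut(assignment, A):
    # Standard conductance: cut weight divided by the smaller side's volume.
    n = len(A)
    cluster = set(i for i in range(n) if assignment[i] == 1)
    complement = set(range(n)) - cluster
    cut = sum(A[i][j] for i in cluster for j in complement)
    vol = lambda S: sum(A[i][j] for i in S for j in range(n))
    return float(cut) / min(vol(cluster), vol(complement))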
Ejemplo n.º 43
0
def get_branch_length_equivalents(tree, first_taxa, second_taxa):
    """
    @param tree: a newick tree
    @param first_taxa: a set of tip names
    @param second_taxa: another set of tip names
    @return: a triple (first distance, second distance, root distance)
    """
    # get the root-augmented distance matrix
    D_aug = get_root_augmented_distance_matrix(tree, first_taxa, second_taxa)
    # get the R matrix
    R_aug = Clustering.get_R_balaji(D_aug)
    # Get the matrix whose elements are block element sums
    # of the root-augmented R matrix.
    block_structure = [0]*len(first_taxa) + [1]*len(second_taxa) + [2]
    B = [[0]*3 for i in range(3)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            B[block_i][block_j] += R_aug[i][j]
    # get the new branch lengths for the subtree
    denominator = 2 * (B[0][1]*B[1][2] + B[1][2]*B[2][0] + B[2][0]*B[0][1])
    blen_first = B[1][2] / denominator
    blen_second = B[2][0] / denominator
    blen_root = B[0][1] / denominator
    return blen_first, blen_second, blen_root
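A brief usage sketch for the function above (the Newick string and taxon groups are made up, and the module's usual imports such as NewickIO and FelTree are assumed to be in scope):

# Hypothetical example: collapse each pair of tips to a single equivalent branch.
tree = NewickIO.parse('((a:1,b:2):1,(c:1,d:3):2);', FelTree.NewickTree)
blen_first, blen_second, blen_root = get_branch_length_equivalents(
        tree, set(['a', 'b']), set(['c', 'd']))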
Ejemplo n.º 44
0
def do_second_method(tree, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out):
    # get the covariance matrix
    ordered_names = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    cov = np.array(tree.get_covariance_matrix(ordered_names))
    # invert the covariance matrix to make the precision matrix
    prec = np.linalg.inv(cov)
    # take the block sums of the precision matrix
    block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2)
    name_order = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    block_prec = [[0]*4 for i in range(4)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            block_prec[block_i][block_j] += prec[i][j]
    # invert the block summed precision matrix
    reduced_cov = np.linalg.inv(np.array(block_prec))
    # extract the branch lengths from the reduced covariance matrix
    a = reduced_cov[0][0] - reduced_cov[0][1]
    b = reduced_cov[1][1] - reduced_cov[0][1]
    c = reduced_cov[2][2] - reduced_cov[2][3]
    d = reduced_cov[3][3] - reduced_cov[2][3]
    e = reduced_cov[0][1] + reduced_cov[2][3]
    # define the distance matrix for the reduced tree
    reduced_D = [
            [0, a+b, a+e+c, a+e+d],
            [b+a, 0, b+e+c, b+e+d],
            [c+e+a, c+e+b, 0, c+d],
            [d+e+a, d+e+b, d+c, 0]]
    # get the R matrix of the reduced tree
    reduced_R = Clustering.get_R_balaji(reduced_D)
    print >> out, 'second method:'
    print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a, b)
    print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c, d)
    print >> out, 'equivalent connecting branch length:', e
    print >> out, 'M for the equivalent tree:'
    print >> out, MatrixUtil.m_to_string(reduced_R)
    print >> out
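The nested double loop above that accumulates block_prec (and the matching block_R/block_M loops in the examples below) just takes block sums of a square matrix. For reference, a vectorized equivalent, assuming numpy and the same kind of block_structure index list, could look like this (the helper name block_sums is illustrative):

import numpy as np

def block_sums(M, block_structure, n_blocks):
    # S[b, i] is 1 when element i belongs to block b, so S M S^T holds the block sums.
    S = np.zeros((n_blocks, len(block_structure)))
    for i, b in enumerate(block_structure):
        S[b, i] = 1.0
    return S.dot(np.asarray(M)).dot(S.T)

With that helper, block_prec = block_sums(prec, block_structure, 4) would reproduce the loop above.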
Ejemplo n.º 45
0
def run(my_map, reviews, restaurants):
    restaurants = Clustering.filter_restaurants(restaurants)
    normalized_restaurant_ids_to_topics, lda = Clustering.get_predictions(my_map, reviews, restaurants)
    create_gaussian_clusters_and_map(restaurants, normalized_restaurant_ids_to_topics, my_map, lda)   
Ejemplo n.º 46
0
		#print data about each category of videos
		# print 'ID: '+ str(vidCat)
		# if 'en' in miniLangList.keys():
			
		# 	sent = sentimentAnalyzer(miniLangList['en'])
		# 	print 'Average Polarity: {:1.5f}'.format(sent[0])
		# 	print 'Average Extremity: {:1.5f}'.format(sent[1])
		# 	freqs = wordFrequency(miniLangList['en'])
		# 	print 'Top 20:'
		# 	print freqs[0:20]
		# 	spell = spellCheck(miniLangList['en'])
		# 	print 'Spelled Right: {}\nSpelled Wrong: {}'.format(spell[0], spell[1])#,spell[2])
		# 	print[(k, len(miniLangList[k])) for k in sorted(miniLangList.keys())]
		# 	print ''
	print 'ALL'
	sent = sentimentAnalyzer(masterList['en'])
	print 'Average Polarity: {:1.5f}'.format(sent[0])
	print 'Average Extremity: {:1.5f}'.format(sent[1])
	freqs = wordFrequency(masterList['en'])
	print 'Top 20:'
	print freqs[0:100]
	spell = spellCheck(masterList['en'])
	print 'Spelled Right: {}\nSpelled Wrong: {}'.format(spell[0], spell[1])#,spell[2])
	print 'Percentage Misspelled: {}'.format(spell[1]*100.0/(spell[0]+spell[1]))
	langs = [(k, len(masterList[k])) for k in masterList.keys()]
	print sorted(langs, key=lambda x:x[1],reverse=True)

	Clustering.kMeans(masterList['en'], numClusters=10, numDefiningWords=15)
	#Clustering.dumbClustering(masterList['en'])
Ejemplo n.º 47
0
def getMaxSimilarityForUPGMA(d):
    maxSimilarity = Clustering.getMax(d)
    return maxSimilarity
Ejemplo n.º 48
0
 datawriter = csv.writer(dfile, delimiter=',')
 labelwriter = csv.writer(lfile, delimiter=',')
 infowriter = csv.writer(ifile, delimiter=',')
 lcount = 1 #line count
 
 # get test name into a dictionary
 test_dict = testprocessing.get_test(reader,start_col)
 rfile.seek(0)
 # get diagnostic info into a dictionary
 diag_dict = testprocessing.read_labtestcvs("LabTestsInfo.csv", test_dict)
 # get the latest lab test result for every patient
 latest_test_data = testprocessing.convert_test_line(reader,test_dict,start_col)
 rfile.seek(0)
 
 # Get the class label for each state with respect to patient's health states
 classification = Clustering.runKmeans(state_file)
 
 # write an empty row as the first row of each output file
 infowriter.writerow([])
 datawriter.writerow([])
 labelwriter.writerow([])
 
 for index, row in enumerate(reader):
     # if lcount < 20:
     if proc_train:
         convert_line(classification[index],latest_test_data[index], diag_dict, row, datawriter, labelwriter, infowriter, True)
         '''
         if index >20:
             break
         '''
     else:
Ejemplo n.º 49
0
def get_response_content(fs):
    # read the tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # create a putative list of nodes
    putative_nodes = []
    putative_nodes.extend(list(tree.gen_tips()))
    if not fs.standard:
        putative_nodes.extend(list(tree.gen_internal_nodes()))
    # get the ordered ids and ordered names of the selected nodes in the tree
    ordered_name_id_pairs = []
    for node in putative_nodes:
        name = node.get_name()
        if fs.named and not name:
            continue
        ordered_name_id_pairs.append((name, id(node)))
    ordered_ids = [id_ for name, id_ in ordered_name_id_pairs]
    ordered_names = [name for name, id_ in ordered_name_id_pairs]
    id_to_index = dict((id_, i) for i, id_ in enumerate(ordered_ids))
    # assert that names are non-empty
    for name in ordered_names:
        if not name:
            raise HandlingError("each node must be named")
    # assert that names are unique
    n = len(ordered_ids)
    if len(set(ordered_names)) != n:
        raise HandlingError("each node must be uniquely named")
    # get the R matrix from the tree; this is -1/2 times the laplacian matrix
    if fs.standard:
        D = tree.get_distance_matrix(ordered_names)
    elif fs.augmented:
        D = tree.get_full_distance_matrix(ordered_ids)
    elif fs.named:
        D = tree.get_partial_distance_matrix(ordered_ids)
    R = Clustering.get_R_balaji(D)
    R_trace = sum(R[i][i] for i in range(n))
    # get the best partition and partition value
    value_Y_pairs = []
    for Y in Clustering.gen_assignments(n):
        value = Clustering.get_exact_criterion(R, Y)
        value_Y_pairs.append((value, Y))
    best_value, best_Y = max(value_Y_pairs)
    # convert the best Y vector to a partition
    pos_set = set(ordered_names[i] for i, el in enumerate(best_Y) if el > 0)
    neg_set = set(ordered_names[i] for i, el in enumerate(best_Y) if el < 0)
    # get fiedler split information
    fiedler_eigenvector = get_eigenvector_of_interest(R)
    fiedler_pos_set = set(ordered_names[i] for i, elem in enumerate(fiedler_eigenvector) if elem > 0)
    fiedler_neg_set = set(ordered_names[i] for i, elem in enumerate(fiedler_eigenvector) if elem < 0)
    # write the paragraphs
    paragraphs = []
    if fs.show_split:
        lines = [
            "exact criterion partition:",
            str(list(best_Y)),
            set_to_string((set_to_string(neg_set), set_to_string(pos_set))),
        ]
        paragraphs.append("\n".join(lines))
    if fs.show_value:
        lines = ["exact criterion value:", str(best_value)]
        paragraphs.append("\n".join(lines))
    if fs.show_value_minus_trace:
        lines = ["exact criterion value minus trace:", str(best_value - R_trace)]
        paragraphs.append("\n".join(lines))
    if fs.show_fiedler_split:
        lines = [
            "spectral sign partition:",
            set_to_string((set_to_string(fiedler_neg_set), set_to_string(fiedler_pos_set))),
        ]
        paragraphs.append("\n".join(lines))
    if fs.show_fiedler_eigenvector:
        lines = ["eigenvector of interest:", str(list(fiedler_eigenvector))]
        paragraphs.append("\n".join(lines))
    if fs.show_labels:
        lines = ["ordered labels:"] + ordered_names
        paragraphs.append("\n".join(lines))
    if fs.show_distance_matrix:
        if fs.augmented:
            title = "augmented distance matrix:"
        elif fs.standard:
            title = "distance matrix:"
        elif fs.named:
            title = "distance matrix:"
        lines = [title, MatrixUtil.m_to_string(D)]
        paragraphs.append("\n".join(lines))
    if fs.show_M_matrix:
        lines = ["M matrix:", MatrixUtil.m_to_string(R)]
        paragraphs.append("\n".join(lines))
    # return the response
    return "\n\n".join(paragraphs) + "\n"
Ejemplo n.º 50
0
def run(my_map, reviews, restaurants):
    restaurants = Clustering.filter_restaurants(restaurants, reviews)
    normalized_restaurant_ids_to_topics, lda = Clustering.get_predictions(my_map, reviews, restaurants)
    elbow_clustering(restaurants, normalized_restaurant_ids_to_topics, my_map)   
Ejemplo n.º 51
0
def elbow_clustering(restaurants, restaurant_ids_to_topics, my_map):
    data = Clustering.create_data_array(restaurants, restaurant_ids_to_topics, my_map)
    print "starting elbow clustering"
    ElbowClustering.plot_elbow_and_gap(data)
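ElbowClustering.plot_elbow_and_gap is project code that is not shown in this listing. As a rough sketch of the elbow idea only, assuming scikit-learn and matplotlib are available and data is the array produced by Clustering.create_data_array, the elbow curve is just k-means inertia plotted against the number of clusters:

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def plot_elbow(data, max_k=10):
    # Fit k-means for each k and record the within-cluster sum of squares (inertia).
    inertias = [KMeans(n_clusters=k, n_init=10).fit(data).inertia_
                for k in range(1, max_k + 1)]
    plt.plot(range(1, max_k + 1), inertias, marker='o')
    plt.xlabel('number of clusters k')
    plt.ylabel('inertia')
    plt.savefig('elbow.png')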
Ejemplo n.º 52
0
def get_response_content(fs):
    # read the values from the form
    subtree_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    taxa_a1 = Util.get_stripped_lines(StringIO(fs.taxa_a1))
    taxa_a2 = Util.get_stripped_lines(StringIO(fs.taxa_a2))
    subtree_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    taxa_b1 = Util.get_stripped_lines(StringIO(fs.taxa_b1))
    taxa_b2 = Util.get_stripped_lines(StringIO(fs.taxa_b2))
    connecting_branch_length = fs.blen
    # assert that no group of taxa contains duplicates
    for taxa in (taxa_a1, taxa_a2, taxa_b1, taxa_b2):
        if len(set(taxa)) != len(taxa):
            raise HandlingError('one of the lists of taxa contains duplicates')
    # assert that each subtree has at least two tips and no duplicates
    for tree in (subtree_a, subtree_b):
        tip_names = list(node.get_name() for node in tree.gen_tips())
        if len(tip_names) < 2:
            raise HandlingError('each subtree should have at least two tips')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('a subtree has duplicate tip names')
    # assert that the partitions are valid
    first_group = ('A', subtree_a, taxa_a1, taxa_a2) 
    second_group = ('B', subtree_b, taxa_b1, taxa_b2)
    for tree_name, tree, taxa_1, taxa_2 in (first_group, second_group):
        tip_names = set(node.get_name() for node in tree.gen_tips())
        for group_name, taxa in (('1', taxa_1), ('2', taxa_2)):
            nonsense_names = list(set(taxa) - set(tip_names))
            msg_a = 'the following taxa in group %s ' % group_name
            msg_b = 'of subtree %s ' % tree_name
            msg_c = 'are not valid tips: %s' % str(nonsense_names)
            message = msg_a + msg_b + msg_c
            if nonsense_names:
                raise HandlingError(message)
        if set(taxa_1) & set(taxa_2):
            msg_a = 'the taxon lists for subtree %s ' % tree_name
            msg_b = 'are not disjoint'
            raise HandlingError(msg_a + msg_b)
        if set(taxa_1) | set(taxa_2) < tip_names:
            msg_a = 'a tip in subtree %s ' % tree_name
            msg_b = 'is not represented in either of the groups'
            raise HandlingError(msg_a + msg_b)
    # define the response
    out = StringIO()
    # get the results for the first method
    do_first_method(subtree_a, subtree_b, taxa_a1, taxa_a2,
            taxa_b1, taxa_b2, connecting_branch_length, out)
    # define the entire tree by connecting the subtrees
    subtree_b.get_root().set_branch_length(connecting_branch_length)
    subtree_a.get_root().add_child(subtree_b.get_root())
    tree = subtree_a
    # define the order and structure of the distance matrix
    block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2)
    name_order = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    # get the distance matrix
    fel_tree = NewickIO.parse(NewickIO.get_newick_string(tree),
            FelTree.NewickTree)
    D = fel_tree.get_distance_matrix(name_order)
    # get the R matrix
    R = Clustering.get_R_balaji(D)
    # get the sums of block elements of R
    block_R = [[0]*4 for i in range(4)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            block_R[block_i][block_j] += R[i][j]
    # show the results from the second method
    do_second_method(fel_tree, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # show the results from the third method
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    for t in (tree_m3_a, tree_m3_b):
        neo = Newick.NewickNode()
        neo.name = 'special'
        neo.blen = connecting_branch_length / 2
        t.get_root().add_child(neo)
    feltree_m3_a = NewickIO.parse(NewickIO.get_newick_string(tree_m3_a),
            FelTree.NewickTree)
    feltree_m3_b = NewickIO.parse(NewickIO.get_newick_string(tree_m3_b),
            FelTree.NewickTree)
    tree_m3_a = NewickIO.parse(fs.subtree_a, Newick.NewickTree)
    tree_m3_b = NewickIO.parse(fs.subtree_b, Newick.NewickTree)
    new_root = Newick.NewickNode()
    tree_m3_a.get_root().blen = connecting_branch_length / 2
    tree_m3_b.get_root().blen = connecting_branch_length / 2
    new_root.add_child(tree_m3_a.get_root())
    new_root.add_child(tree_m3_b.get_root())
    tree_m3 = Newick.NewickTree(new_root)
    feltree_m3 = NewickIO.parse(NewickIO.get_newick_string(tree_m3),
            FelTree.NewickTree)
    branch_d2 = connecting_branch_length / 2
    do_third_method(feltree_m3_a, feltree_m3_b, feltree_m3,
            branch_d2, taxa_a1, taxa_a2, taxa_b1, taxa_b2, out)
    # show the expected results
    print >> out, 'M:'
    print >> out, MatrixUtil.m_to_string(R)
    print >> out, 'M summed within blocks:'
    print >> out, MatrixUtil.m_to_string(block_R)
    # return the response
    return out.getvalue()
Ejemplo n.º 53
0
def _Go():
    global flag;global num_days;global length;global steps;
    global f;global sa;global plots;global curr_pos

    sel_task = task.get()
   
    if (data_path == None):
        display_error(1)
    elif (sel_task == 'None'):
        display_error(2)
    elif (sel_task == 'Summarize'):
        curr_pos = 0
        plots = []
        name = str(stock.get())
        if name != "None":
            lot = length.get()
            if lot=="":
                display_error(3)
            else:
                lot = int(lot)
                if type(lot) != int:
                    display_error(4)
                elif lot <= 0:
                    display_error(5)
                    
            if lot != "" and type(int(lot)) == int and int(lot) > 0:
                data = LoadTimeseries.read_p_timeseries(data_path, name, lot)
                original, summarized = Summarize.summarize(data)
                plt.clf()

                #yearsFmt = mdates.DateFormatter('%b %Y')
                f = plt.figure(figsize=(6,6), dpi=100, facecolor='white')
                plt.subplot(411)
                plt.title("Initial Timeserie")
                original[name].ix[original[name].index].plot(style='b')
                
##                print(summarized[name]["extremas-x"])
##                tps = []
##                tps_ind = []
##                for i in range(len(original[name])):
##                    if i in summarized[name]["extremas-x"]:
##                        tps.append(original[name][i])
##                        tps_ind.append(original[name].index[i])
##                print(tps_ind)
##                print(tps)
##                f = Series(tps, index=tps_ind).ix[tps_ind].plot(style='r',marker='o', linestyle="", markersize=7)

                plt.subplot(412)
                plt.title("Overall Timeserie Trend")
                plt.plot(summarized[name]["trend"]["x"],summarized[name]["trend"]["r"],'g')
                plt.xlim([0,lot])
                
                plt.subplot(413)
                plt.title("Turning Points")
                plt.plot(summarized[name]["extremas-x"], summarized[name]["extremas"], 'o', mfc='none', markersize=7)
                plt.xlim([0,lot])
                
                plt.subplot(414)
                plt.title("Seasonality and Cycle")
                plt.plot(summarized[name]["Ds-x"], summarized[name]["Ds"], 'r')
                plt.xlim([0,lot])
                plt.tight_layout()
                canvas.figure = f
                canvas.draw()
                
        else:
            display_error(6)
        
        
    elif sel_task == 'Clustering':
        n = num.get()
        lot = length.get()
        
        if n == "":
            display_error(7)
        else:
            n = int(n)
            if type(n) != int:
                display_error(8)
            elif n <= 0:
                display_error(9)

        if lot == "":
            display_error(10)
        else:
            lot = int(lot)
            if type(lot) != int:
                display_error(11)
            elif lot <= 0:
                display_error(12)
            
        if type(lot) == int and type(n) == int and lot > 0 and n > 0:
            curr_pos = 0
            plots = []        
            data, aDates = LoadTimeseries.read_c_timeseries(data_path, n, lot)
            original, summarized = Summarize4Clustering.summarize(data, aDates, lot)
            plt.clf()
            curr_pos = 0
            plots = Clustering.cluster(original, summarized, n)
            canvas.figure = plots[0]
            canvas.draw()
       
    elif sel_task == 'Prediction':
        curr_pos = 0
        plots = []
        name = str(stock.get())
        if name != "None":
            lot = length.get()
            sp = steps.get()
            if sp == "":
                display_error(13)
            else:
                sp = int(sp)
                if type(sp) != int:
                    display_error(14)
                elif sp <= 0:
                    display_error(15)
                
            if lot != "":
                lot = int(lot)
                if type(lot) != int:
                    display_error(11)
                elif lot <= 0:
                    display_error(12)
                
            if type(lot) == int and type(sp) == int and lot+sp > num_days:
                display_error(16)

            if type(lot) == int and lot > 0 and type(sp) == int and sp > 0 and lot+sp<=num_days:
                data = LoadTimeseries.read_p_timeseries(data_path, name, lot+sp)
                original, summarized = Summarize4Prediction.summarize(data,sp)
                orig_pre, orig_r_pre, sum_pre = Prediction.predict(original, summarized[name]["Ds"], summarized[name]["AL"], sp)
                plt.clf()
               # print(original)
                f = plt.figure(figsize=(6,6), dpi=100, facecolor='white')
                original[name].ix[original[name].index[0]:].plot(style='b', label="Actual")
                orig_pre.ix[orig_pre.index[0]:].plot(style='r',label="ARMA")
                orig_r_pre.ix[orig_r_pre.index[0]:].plot(style='purple',label="r-ARMA")
                sum_pre.ix[orig_pre.index[0]:].plot(style='g', label="Summarized")
                plt.legend(loc=2,prop={'size':10})
                canvas.figure = f
                canvas.draw()
        else:
            display_error(3)
Ejemplo n.º 54
0
def do_third_method(tree_a, tree_b, tree, branch_d2,
        taxa_a1, taxa_a2, taxa_b1, taxa_b2, out):
    print >> out, 'third method:'
    # get the covariance matrices of the mini trees
    ordered_names = taxa_a1 + taxa_a2 + taxa_b1 + taxa_b2
    ordered_names_a = taxa_a1 + taxa_a2 + ['special']
    ordered_names_b = taxa_b1 + taxa_b2 + ['special']
    block_structure = get_block_structure(taxa_a1, taxa_a2, taxa_b1, taxa_b2)
    block_structure_a = [0]*len(taxa_a1) + [1]*len(taxa_a2) + [2]
    block_structure_b = [0]*len(taxa_b1) + [1]*len(taxa_b2) + [2]
    cov = np.array(tree.get_covariance_matrix(ordered_names))
    cov_a = np.array(tree_a.get_covariance_matrix(ordered_names_a))
    cov_b = np.array(tree_b.get_covariance_matrix(ordered_names_b))
    prec_a = np.linalg.inv(cov_a)
    block_prec_a = [[0]*3 for i in range(3)]
    for i, block_i in enumerate(block_structure_a):
        for j, block_j in enumerate(block_structure_a):
            block_prec_a[block_i][block_j] += prec_a[i][j]
    prec_b = np.linalg.inv(cov_b)
    block_prec_b = [[0]*3 for i in range(3)]
    for i, block_i in enumerate(block_structure_b):
        for j, block_j in enumerate(block_structure_b):
            block_prec_b[block_i][block_j] += prec_b[i][j]
    a = block_prec_a[0][0]
    b = block_prec_a[0][1]
    d = block_prec_a[1][1]
    e = block_prec_b[0][0]
    f = block_prec_b[0][1]
    h = block_prec_b[1][1]
    x = branch_d2
    # make the block M matrix using a clever formula
    Q_a = [
            [a, b, 0, 0],
            [b, d, 0, 0],
            [0, 0, e, f],
            [0, 0, f, h]]
    den_a = (a + 2*b + d + 1/x)
    den_b = (e + 2*f + h + 1/x)
    Q_b = [
            [(a+b)*(a+b)/den_a, (a+b)*(b+d)/den_a, 0, 0],
            [(b+d)*(a+b)/den_a, (b+d)*(b+d)/den_a, 0, 0],
            [0, 0, (e+f)*(e+f)/den_b, (e+f)*(f+h)/den_b],
            [0, 0, (f+h)*(e+f)/den_b, (f+h)*(f+h)/den_b]]
    glom = a+2*b+d+e+2*f+h+2*x*(a+2*b+d)*(e+2*f+h)
    den_a2 = (den_a / den_b) * glom
    den_b2 = (den_b / den_a) * glom
    Q_c = [
            [(a+b)*(a+b)/den_a2, (a+b)*(b+d)/den_a2,
                (a+b)*(e+f)/glom, (a+b)*(f+h)/glom],
            [(b+d)*(a+b)/den_a2, (b+d)*(b+d)/den_a2,
                (b+d)*(e+f)/glom, (b+d)*(f+h)/glom],
            [(e+f)*(a+b)/glom, (e+f)*(b+d)/glom,
                (e+f)*(e+f)/den_b2, (e+f)*(f+h)/den_b2],
            [(f+h)*(a+b)/glom, (f+h)*(b+d)/glom,
                (f+h)*(e+f)/den_b2, (f+h)*(f+h)/den_b2]]
    Q = np.array(Q_a) - np.array(Q_b) - np.array(Q_c)
    print >> out, 'cleverly constructed block M:'
    print >> out, MatrixUtil.m_to_string(Q)
    # make the equivalent tree
    a_star = (b+d)/(a*d-b*b)
    b_star = (a+b)/(a*d-b*b)
    c_star = (f+h)/(e*h-f*f)
    d_star = (e+f)/(e*h-f*f)
    e_star = 2*x - b/(a*d-b*b) - f/(e*h-f*f)
    print >> out, 'using the block precision matrix:'
    print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a_star, b_star)
    print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c_star, d_star)
    print >> out, 'equivalent connecting branch length:', e_star
    # make the block M matrix using Eric's formula (corrected)
    A, B, C, D, E = a_star, b_star, c_star, d_star, e_star
    H = A*B*(C+D) + C*D*(A+B) + E*(C+D)*(A+B)
    Q = [[1/H]*4 for i in range(4)]
    Q[0][0] *= B*D + B*C + C*D + E*(C+D)
    Q[0][1] *= -C*D - E*(C+D)
    Q[0][2] *= -B*D
    Q[0][3] *= -B*C
    Q[1][0] *= -C*D - E*(C+D)
    Q[1][1] *= A*D + A*C + C*D + E*(C+D)
    Q[1][2] *= -A*D
    Q[1][3] *= -A*C
    Q[2][0] *= -B*D
    Q[2][1] *= -A*D
    Q[2][2] *= A*B + B*D + A*D + E*(A+B)
    Q[2][3] *= -A*B - E*(A+B)
    Q[3][0] *= -B*C
    Q[3][1] *= -A*C
    Q[3][2] *= -A*B - E*(A+B)
    Q[3][3] *= A*B + B*C + A*C + E*(A+B)
    print >> out, 'reconstructed block M:'
    print >> out, MatrixUtil.m_to_string(Q)
    M = Clustering.get_R_balaji(cov)
    M_a = Clustering.get_R_balaji(cov_a)
    M_b = Clustering.get_R_balaji(cov_b)
    block_M = [[0]*4 for i in range(4)]
    for i, block_i in enumerate(block_structure):
        for j, block_j in enumerate(block_structure):
            block_M[block_i][block_j] += M[i][j]
    block_M_a = [[0]*3 for i in range(3)]
    for i, block_i in enumerate(block_structure_a):
        for j, block_j in enumerate(block_structure_a):
            block_M_a[block_i][block_j] += M_a[i][j]
    block_M_b = [[0]*3 for i in range(3)]
    for i, block_i in enumerate(block_structure_b):
        for j, block_j in enumerate(block_structure_b):
            block_M_b[block_i][block_j] += M_b[i][j]
    c_1 = block_M_a[0][1]
    c_2 = block_M_a[0][2]
    c_3 = block_M_a[1][2]
    denominator = (c_1*c_2) + (c_2*c_3) + (c_3*c_1)
    a_star = -c_3 / denominator
    b_star = -c_2 / denominator
    e_star_a = -c_1 / denominator
    c_1 = block_M_b[0][1]
    c_2 = block_M_b[0][2]
    c_3 = block_M_b[1][2]
    denominator = (c_1*c_2) + (c_2*c_3) + (c_3*c_1)
    c_star = -c_3 / denominator
    d_star = -c_2 / denominator
    e_star_b = -c_1 / denominator
    e_star = e_star_a + e_star_b
    print >> out, 'using the block M matrix:'
    print >> out, 'equivalent subtree A:', '(a1:%f, a2:%f);' % (a_star, b_star)
    print >> out, 'equivalent subtree B:', '(b1:%f, b2:%f);' % (c_star, d_star)
    print >> out, 'equivalent connecting branch length:', e_star
    print >> out, 'calculated block M:'
    print >> out, MatrixUtil.m_to_string(block_M)
    print >> out
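For reference, the equivalent branch lengths computed in the "using the block precision matrix" step above are, written out (with $a, b, d$ the block precision entries of subtree A, $e, f, h$ those of subtree B, and $x$ the half connecting branch length passed in as branch_d2):

$$
a^{*} = \frac{b+d}{ad-b^{2}}, \qquad
b^{*} = \frac{a+b}{ad-b^{2}}, \qquad
c^{*} = \frac{f+h}{eh-f^{2}}, \qquad
d^{*} = \frac{e+f}{eh-f^{2}}, \qquad
e^{*} = 2x - \frac{b}{ad-b^{2}} - \frac{f}{eh-f^{2}}.
$$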
Ejemplo n.º 55
0
	                   help='a single filename to read from (can be compiled using feedvector)')
	parser.add_argument("--draw", "-d",
						help='this outputs a dendrogram jpg for hclusters and a 2d map for kclusters', action="store_true")
	parser.add_argument("--rotate", "-r",
						help='flip the rows and columns', action="store_true")
	parser.add_argument("--kcluster", "-k",
					    help="uses a kcluster instead of an hcluster", action="store_true")
	parser.add_argument("--scaledown", "-s",
						help="uses scaling algorithm for clustering display", action="store_true")

	args = parser.parse_args()

	blognames, words, data = readfile(str(args.file))

	if args.rotate:
		data = Clustering.rotatematrix(data)
		print "data rotated"

	if args.kcluster:
		# --kcluster should use the k-means style clusterer rather than hcluster
		clust = Clustering.kcluster(data)
	else:
		clust = Clustering.hcluster(data)

	if args.rotate:
		Clustering.printclust(clust, labels=words)
	else:
		Clustering.printclust(clust, labels=blognames)

	if(args.draw):
		if args.kcluster:
			print repr(clust)
Ejemplo n.º 56
0
Archivo: TRAP.py Proyecto: keialk/TRAP
def main() :
	# Start of TRAP
	sys.stdout.write("\n###### Pathway and clustering analysis ######\n\n")
	sys.stdout.flush()

	cuffPath = "cufflinks_result"
	diffPath = "cuffdiff_result"
	resultPath = "TRAP_result"

	controlList = []
	caseList = []
	diffList = []
	timeLen = 0
	geneIDPath = ""
	pnamePath = ""
	kgmlPath = ""
	xmlPath = ""
	cuffdiff= ""
	pCut = 0.05
	DEGCut = 2.0
	clusterCut = 2.0
	timeLag = 1.0

	fcList = {}	# fcList[geneID]=[fc_0, ... , fc_t]
	pVal = {} 	# pVal[geneID]=[p_0, ..., p_t]
	idDic = {}	# idDic[keggID]=geneID
	pnameDic = {}	# pnameDic[pID]=pathwayName
	 

	# Reading configuration file
	sys.stdout.write("Reading configuration file\t......\t")
	sys.stdout.flush()

	try :
		config = open("config.txt", "r")
		while True :
			cl = config.readline()
			if cl=="" :
				break
			tp = cl.split("=")
			if (len(tp)<2) :
				continue
			key=tp[0].strip()
			val=tp[1].strip()
			if (key[:7]=="control") :
				controlList.append(val.split(','))	
			elif (key[:9]=="treatment") :
				caseList.append(val.split(','))
			elif (key=="numTP") :
				timeLen=int(val)	
			elif (key=="convfilePath") :
				geneIDPath = val
			elif (key=="pnamePath") :
				pnamePath = val
			elif (key=="kgmlPath") :
				kgmlPath = val
			elif (key=="cuffdiff") :
				cuffdiff = val
			elif (key=="pVal") :
				pCut = float(val)
			elif (key[:4]=="diff") :
				diffList.append(val)
			elif (key=="DEGCut") :
				DEGCut = float(val)
			elif (key=="clusterCut") :
				clusterCut = float(val)
			elif (key=="timeLag") :
				timeLag = float(val)
			else :
				continue
		idFile = open(geneIDPath, "r")
		pnameFile = open(pnamePath, "r")
		xmlPath = os.walk(kgmlPath)

		if (cuffdiff=="no" and len(controlList)!=len(caseList)) :
			raise

	except IOError:
		print "Check if the configuration file exists"
	except :
		print "Configuration file error"
		raise
	# Make sure result path exists
	try:
		os.makedirs(resultPath)
	except OSError:
		if not os.path.isdir(resultPath):
			raise
	# Copy the config file so we don't get confused about which params were used
	copy("config.txt", resultPath)

	# Reading ID-conversion / pathway name file
	for ids in idFile.readlines() :
		tp = ids.split("\t")
		tp2 = tp[1].split(";")
		tp3 = tp2[0].split(", ")
		if tp[0] in idDic :
			for name in tp3 :
				idDic[tp[0]].append(name.strip())
		else : 
			idDic[tp[0]]=[]
			for name in tp3 :
				idDic[tp[0]].append(name.strip())
	idFile.close()

	for path in pnameFile.readlines() :
		tp = path.split("\t")
		tp2 = tp[0].split(":")
		tp3 = tp[1].split(" - ")
		pnameDic[tp2[1]]=tp3[0]
	pnameFile.close()

	sys.stdout.write("Done\n")
	sys.stdout.flush()

	# Reading fpkm file
	sys.stdout.write("Reading expression files\t......\t")
	sys.stdout.flush()

	geneSum = set()
	if cuffdiff=="yes" :
		for j in range(timeLen) :
			pfile = open(os.path.join(diffPath, diffList[j], "gene_exp.diff"), "r")
			for l in pfile.readlines() :
				if l.startswith('#'):
					continue
				tp = l.split()
				if not is_number(tp[9]) :
					continue
				geneSum.add(tp[2])
			pfile.close()
		for gene in geneSum :
			fcList[gene]=[]
			pVal[gene]=[]
		for j in range(timeLen) :
			pfile = open(os.path.join(diffPath, diffList[j], "gene_exp.diff"), "r")
			temp = {}
			temp2 = {}
			for l in pfile.readlines() :
				if l.startswith('#'):
					continue

				tp = l.split()
				if not is_number(tp[9]) :
					continue
				if ( tp[9]=='inf' or tp[9] == '-inf') :
					temp[tp[2]]=0
				else:
					temp[tp[2]]=float(tp[9])
				temp2[tp[2]]=float(tp[12])

			for gene in geneSum :
				if gene in temp :
					fcList[gene].append(temp[gene])
					pVal[gene].append(temp2[gene])
				else :
					fcList[gene].append(0)
					pVal[gene].append(1)
			pfile.close()

	else : 
		for j in range(timeLen) :
			for con in controlList[j] :
				pfile = open(os.path.join(cuffPath, con, "genes.fpkm_tracking"), "r")
				for l in pfile.readlines() :
					tp = l.split()
					if not is_number(tp[9]) :
						continue
					geneSum.add(tp[4])
				pfile.close()
			for case in caseList[j] :
				pfile = open(os.path.join(cuffPath, case, "genes.fpkm_tracking"), "r")
				for l in pfile.readlines() :
					tp = l.split()
					if not is_number(tp[9]) :
						continue
					geneSum.add(tp[4])
				pfile.close()
		for gene in geneSum :
			fcList[gene]=[]
		for j in range(timeLen) :
			temp1 = {}
			temp2 = {}
			for con in controlList[j] :
				pfile = open(os.path.join(cuffPath, con, "genes.fpkm_tracking"), "r")
				for l in pfile.readlines() :
					tp = l.split()
					if (tp[9]=="FPKM") :
						continue
					if tp[4] in temp1 : 
						temp1[tp[4]].append(float(tp[9]))
					else : 
						temp1[tp[4]]=[float(tp[9])]
				pfile.close()
			for case in caseList[j] :
				pfile = open(os.path.join(cuffPath, case, "genes.fpkm_tracking"), "r")
				for l in pfile.readlines() :
					tp = l.split()
					if (tp[9]=="FPKM") :
						continue
					if tp[4] in temp2 :
						temp2[tp[4]].append(float(tp[9]))
					else :
						temp2[tp[4]]=[float(tp[9])]
				pfile.close()
			
			for gene in geneSum :
				med1 = 0
				med2 = 0
				if gene in temp1 and gene in temp2 :
					med1 = median(temp1[gene])
					med2 = median(temp2[gene])
				elif gene in temp1 :
					med1 = median(temp1[gene])
				elif gene in temp2 :
					med2 = median(temp2[gene])
				else :
					med1 = med1
					med2 = med2
				
				if (abs(med2-med1)<1.0) : 
					fcList[gene].append(0)
				else :
					fcList[gene].append(math.log((med2+0.01)/(med1+0.01),2))

	sys.stdout.write("Done\n")
	sys.stdout.flush()

	# Parsing xml file to get gene and relation information
	sys.stdout.write("Reading xml files\t\t......\t")
	sys.stdout.flush()

	i=0
	ind = {}
	DEG = []
	wgene = []
	wredic = []
	empty = []
	empty2 = []

	for t in range(0, timeLen) :
		wgene.append([])	#wgene[t][i]={keggID:fc}
		DEG.append([])		#DEG[t][i]=set(keggID)
		empty.append(0)
		empty2.append(1)

	for root,dirs,files in xmlPath:
	   for file in files:
		filetp = file.split(".")
		ind[filetp[0]]=i
		for j in range(0, timeLen) :
			wgene[j].append({})
			DEG[j].append(set())
		wredic.append({})	#wredic[i]={keggID:(list of [asc, length, j])}

		xmlfile = open(os.path.join(kgmlPath, file), "r")
		xmldata = xmlfile.read()
		dom = parseString(xmldata)
		xmlfile.close()

		geneSet = set()
		entrydic = {}
		entries = dom.getElementsByTagName("entry")
		for e in entries :
			if (e.attributes.getNamedItem("type").nodeValue == 'gene') :
				id = e.attributes.getNamedItem("id").nodeValue
				id = str(id)
				genes = e.attributes.getNamedItem("name").nodeValue
				genes = str(genes)
				genelist = genes.split()
				entrydic[id]=[]
				for g in genelist : 
					entrydic[id].append(g)
					geneSet.add(g)
			elif (e.attributes.getNamedItem("type").nodeValue == 'group') :
				id = e.attributes.getNamedItem("id").nodeValue
				id = str(id)
				comps = e.getElementsByTagName("component")
				entrydic[id]=[]
				for c in comps :
					geneId =c.attributes.getNamedItem("id").nodeValue
					for g in entrydic[geneId] :
						entrydic[id].append(g)
						geneSet.add(g)
		for g in geneSet :
			if (g in idDic) :
				nameExist = 0
				tpName = ""
				for name in idDic[g] :
					if name in fcList.keys() :
						nameExist = 1
						tpName = name
						break
				if nameExist==1 :
					for t in range(0, timeLen) :
						foldchange = fcList[tpName][t]
						wgene[t][i][g]=foldchange
						if (cuffdiff=="yes" and pVal[tpName][t]<=pCut and abs(foldchange)>=DEGCut) :
							DEG[t][i].add(g)
						elif (cuffdiff=="no" and abs(foldchange)>=DEGCut) :
							DEG[t][i].add(g)
				else :
					for t in range(0, timeLen) :
						wgene[t][i][g]=0
						fcList[idDic[g][0]]=empty
						if (cuffdiff=="yes") :	
							pVal[idDic[g][0]]=empty2
			else :
				for t in range(0, timeLen) :
					wgene[t][i][g]=0
					fcList[g]=empty
					if (cuffdiff=="yes") :
						pVal[g]=empty2

		redic = wredic[i]
		relations = dom.getElementsByTagName("relation")
		for r in relations :
			subs = r.getElementsByTagName("subtype")
			ent1 = r.attributes.getNamedItem("entry1").nodeValue
			ent2 = r.attributes.getNamedItem("entry2").nodeValue
			if (not (subs==[])) :
				for s in subs :
					type = s.attributes.getNamedItem("name").nodeValue
					if (type=="activation" or type=="expression") :
						j=1
					elif (type=="inhibition" or type=="repression") :
						j=-1
					else :
						j=0
					if (j!=0 and (ent1!=ent2) and (ent1 in entrydic) and (ent2 in entrydic)) :
						for desc in entrydic[ent2] :
							length = len(entrydic[ent2])
							for asc in entrydic[ent1] :
								if (desc in redic) :
									redic[desc].append([asc, length, j])
								else :
									redic[desc]=[[asc, length, j]]
		i=i+1

	fileN = i
	sys.stdout.write("Done\n")
	sys.stdout.flush()


	# 1. One time point SPIA analysis
	sys.stdout.write("One time point SPIA analysis\n")
	sys.stdout.flush()

	for t in range(0, timeLen) :
		sys.stdout.write("\t"+str(t+1)+"th time point\t......\t")
		sys.stdout.flush()

		OT.pathwayAnalysis(os.path.join(resultPath, "OneTime_"+str(t+1)), fileN, wgene[t], wredic, DEG[t], DEGCut,  idDic, pnameDic, ind)

		sys.stdout.write("Done\n")
		sys.stdout.flush()


	# 2. Time-series SPIA analysis
	sys.stdout.write("Time-series SPIA analysis\t......\t")
	sys.stdout.flush()

	TS.pathwayAnalysis(os.path.join(resultPath, "TimeSeries"), wgene, wredic, DEG, idDic, pnameDic, timeLag, timeLen, ind, fcList)

	sys.stdout.write("Done\n")
	sys.stdout.flush()

	# 3. Clustering analysis
	sys.stdout.write("Clustering Analysis\t\t......\t")
	sys.stdout.flush()

	CL.clusteringAnalysis(os.path.join(resultPath, "Clustering"), wgene, fcList, pVal, idDic, pnameDic, clusterCut, pCut, timeLen, ind, cuffdiff)

	sys.stdout.write("Done\n")
	sys.stdout.flush()
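The main() above drives everything from a config.txt file, and the keys it accepts can be read off the parsing loop near the top. The snippet below writes a purely illustrative config (all sample names and paths are hypothetical; with cuffdiff set to "no", the number of control and treatment lines must match numTP):

# Purely illustrative config.txt for the TRAP run above; names and paths are made up.
example_config = """\
control1 = ctrl_t1_rep1,ctrl_t1_rep2
control2 = ctrl_t2_rep1,ctrl_t2_rep2
treatment1 = case_t1_rep1,case_t1_rep2
treatment2 = case_t2_rep1,case_t2_rep2
numTP = 2
convfilePath = gene_id_conversion.txt
pnamePath = pathway_names.txt
kgmlPath = kgml
cuffdiff = no
DEGCut = 2.0
clusterCut = 2.0
timeLag = 1.0
"""
with open("config.txt", "w") as f:
    f.write(example_config)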
def create_gaussian_clusters_and_map(restaurants, restaurant_ids_to_topics, my_map, lda, use_human_labels=True):
    data = Clustering.create_data_array(restaurants, restaurant_ids_to_topics, my_map)
    Clustering.plot_gaussian_clusters(my_map, restaurants, restaurant_ids_to_topics, data, lda)