Example no. 1
import pandas as pd
from scipy.stats import variation


def get_cv_cpv(x: str, percent: float) -> float:
    global model_goal
    # Get dataset number
    dataset_num = get_dataset_num(x)

    # Get number of pcs for CPV > 0.8 and CPV > 0.99
    if percent == 0.99:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.99)"]
    else:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.8)"]

    # Get df_results
    df = pd.read_csv(x)
    idx = df.features_kept == pcs_cpv
    try:
        return df.loc[idx].cv.values[0]
    except IndexError:
        # No cached result for this number of PCs; recompute the clustering.
        inputs = Inputs(paths)
        inputs.random_seed = 1969
        inputs.get_df_split(dataset_num)

        pca_model = get_pca_model(inputs)

        cluster_model = Clustering(inputs.num_cluster, 100, inputs.random_seed)
        cluster_model.fit(pca_model.pcs_train.loc[:, :pcs_cpv - 1])
        cluster_prediction = cluster_model.predict(
            pca_model.pcs_test.loc[:, :pcs_cpv - 1])
        cluster_performances = cluster_model.get_cluster_performances(
            inputs.df_test.copy(),
            cluster_prediction,
            pcs_cpv,
            inputs.num_cluster,
            model_goal=model_goal)
        return variation(cluster_performances)
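`variation` in the snippet above is presumably `scipy.stats.variation`, i.e. the coefficient of variation (standard deviation divided by the mean) of the per-cluster performances. A minimal, self-contained sketch with made-up scores:

import numpy as np
from scipy.stats import variation

# Hypothetical per-cluster performance scores; get_cv_cpv returns their
# coefficient of variation (std / mean).
cluster_performances = np.array([0.82, 0.79, 0.91, 0.76])
print(variation(cluster_performances))  # ~0.068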
Example no. 2
def cluster(self, shapelets):
    """
    Uses a clustering algorithm to reduce the number of shapelets.

    :param shapelets: list of shapelet candidates
    :type shapelets: np.array, shape = (len(shapelets), len(s), len(dim(s)))
    :return: list of remaining shapelet candidates
    :rtype: np.array, shape = (|remaining candidates|, len(s), len(dim(s)))
    """
    clustering = Clustering(self.d_max)
    clustering.fit(shapelets)
    return clustering.nn_centers()
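A hedged usage sketch for the method above: the input is a 3-D array of candidates, and `ShapeletExtractor`/`d_max=0.5` are hypothetical stand-ins for the class that hosts `cluster` and its distance threshold:

import numpy as np

# 50 hypothetical shapelet candidates, each 30 time steps long with
# 3 dimensions per step, matching the documented shape.
shapelets = np.random.rand(50, 30, 3)
# extractor = ShapeletExtractor(d_max=0.5)  # hypothetical host class
# remaining = extractor.cluster(shapelets)  # shape (k, 30, 3) with k <= 50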
Example no. 3
from collections import defaultdict
from typing import Dict, List, Set


def perform_clustering(
        term_ids_to_embs: Dict[int, List[float]]) -> Dict[int, Set[int]]:
    """Cluster the given terms into 5 clusters.

    Args:
        term_ids_to_embs: A dictionary mapping term-ids to their
            embeddings.
    Returns:
        A dictionary mapping each cluster label to its cluster.
        Each cluster is a set of term-ids.
    """
    # Case: fewer than 5 terms to cluster.
    num_terms = len(term_ids_to_embs)
    if num_terms < 5:
        clusters = {}
        for i, tid in enumerate(term_ids_to_embs):
            clusters[i] = {tid}
        return clusters

    # Case: 5 or more terms to cluster.
    c = Clustering()
    term_ids_embs_items = list(term_ids_to_embs.items())
    results = c.fit([it[1] for it in term_ids_embs_items])
    labels = results['labels']
    print('  Density:', results['density'])
    clusters = defaultdict(set)
    for i in range(len(term_ids_embs_items)):
        term_id = term_ids_embs_items[i][0]
        label = labels[i]
        clusters[label].add(term_id)
    return clusters
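The grouping step at the end is a standard defaultdict idiom; here is a self-contained toy version with made-up term-ids and labels standing in for `results['labels']`:

from collections import defaultdict

term_ids = [101, 102, 103, 104, 105]
labels = [0, 1, 0, 2, 1]  # made-up cluster labels, one per term

clusters = defaultdict(set)
for term_id, label in zip(term_ids, labels):
    clusters[label].add(term_id)

print(dict(clusters))  # {0: {101, 103}, 1: {102, 105}, 2: {104}}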
Example no. 4
import pandas as pd


def clustering(x, df, n_clusters=10, distance='angular', method='K-medians'):
    """
    Do the clustering, based on the 91 features.

    Args:
        x: array of features
        df: dataframe of features
        n_clusters: number of clusters
        distance: either 'angular' or 'euclidean'
        method: one of 'K-medians', 'K-means', 'Hierarchical'
    Output:
        new_df: the labeled dataframe, according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """

    relevant_features_id = [
        0, 3, 5, 13, 15, 17, 25, 46, 47, 56, 64, 65, 76, 77, 83, 85, 90
    ]
    keys_dict = [
        '0-1', '0-4', '0-6', '1-2', '1-4', '1-6', '2-3', '4-5', '4-6', '5-7',
        '6-8', '6-9', '8-9', '8-10', '9-12', '10-11', '12-13'
    ]

    clustering_ = Clustering(k=n_clusters, distance=distance, method=method)
    cs, cls = clustering_.fit(x)

    assert len(cls) == n_clusters

    d = pd.DataFrame()
    labels = []
    for i in range(n_clusters):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        labels += [i] * len(cls[i])

    d.columns = df.columns
    d.insert(91, 'label', labels)

    new_df = df.reset_index().merge(d).set_index('index')

    relevant_features_cs = []
    if method != 'Hierarchical':
        for i in range(len(cs)):
            cs_rf = cs[i][relevant_features_id]
            relevant_features_cs.append(dict(zip(keys_dict, cs_rf)))

    return new_df, relevant_features_cs, cs
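The concat/merge bookkeeping above is easier to see on toy data. A minimal sketch, assuming `cls` maps each cluster index to the list of rows assigned to it (which is what `Clustering.fit` appears to return):

import pandas as pd

df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=['a', 'b'])
cls = {0: [[1.0, 2.0], [5.0, 6.0]], 1: [[3.0, 4.0]]}  # toy cluster -> rows

d = pd.DataFrame()
labels = []
for i in sorted(cls):
    d = pd.concat([d, pd.DataFrame(cls[i])], sort=False)
    labels += [i] * len(cls[i])
d.columns = df.columns
d.insert(len(d.columns), 'label', labels)  # the real code inserts at column 91

new_df = df.reset_index().merge(d).set_index('index')
print(new_df)  # each original row now carries its cluster label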
Example no. 5
import pandas as pd


def clustering(x, n_clusters):
    """
    Do the clustering, based on the 91 features.

    We compute the reconstructed poses only with the following default
    parameters:
        method: 'K-Medians'
        distance: 'angular'
    Args:
        x: array of features
        n_clusters: number of clusters
    Output:
        new_df: the labeled dataframe, according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """

    clustering_ = Clustering(k=n_clusters)
    cs, cls = clustering_.fit(x)
    # `df`, `relevant_features_id` and `keys_dict` are assumed to be
    # module-level globals here.
    d = pd.DataFrame()
    labels = []
    for i in range(len(cs)):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        labels += [i] * len(cls[i])

    d.columns = df.columns
    d.insert(91, 'label', labels)

    new_df = df.reset_index().merge(d).set_index('index')

    assert len(cs) == n_clusters

    relevant_features_cs = []
    for i in range(len(cs)):
        cs_rf = cs[i][relevant_features_id]
        relevant_features_cs.append(dict(zip(keys_dict, cs_rf)))

    return new_df, relevant_features_cs, cs
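The centroid post-processing is the same `dict(zip(...))` pattern in both variants. A toy version with two relevant indices instead of seventeen (the names below are made-up stand-ins for the module-level globals):

import numpy as np

relevant_features_id = [0, 2]  # toy stand-in
keys_dict = ['0-1', '0-4']     # toy stand-in
cs = {0: np.array([0.1, 0.5, 0.9]),
      1: np.array([0.2, 0.6, 1.0])}  # toy centroid feature vectors

relevant_features_cs = []
for i in range(len(cs)):
    cs_rf = cs[i][relevant_features_id]
    relevant_features_cs.append(dict(zip(keys_dict, cs_rf.tolist())))
print(relevant_features_cs)
# [{'0-1': 0.1, '0-4': 0.9}, {'0-1': 0.2, '0-4': 1.0}]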
Example no. 6
from clustering import Clustering
# from embeddings import *

words = [
    'computer', 'algorithm', 'program', 'bear', 'cat', 'snake', 'fish', 'tree',
    'flower', 'grass', 'tea', 'water', 'milk'
]

# embedder = FastTextE()
# embedder.load_model()
emb_dict = {}
with open('fasttext-wiki-news-300d-1M.vec', 'r', encoding='utf8') as f:
    next(f)  # skip the "<token count> <dimension>" header line of the .vec format
    for line in f:
        sp_line = line.rstrip().split(' ')
        # Parse vector components as floats rather than keeping raw strings.
        token, vector = sp_line[0], [float(x) for x in sp_line[1:]]
        emb_dict[token] = vector
word_embeddings = [emb_dict[word] for word in words]
clus = Clustering()
print(clus)
print(clus.clus_type)
print(clus.affinity)
print(clus.fit(word_embeddings))
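Loading all one million vectors just to embed thirteen words is wasteful; a memory-friendlier sketch of the same loop that stops early (same file and format assumptions as above):

needed = set(words)
emb_dict = {}
with open('fasttext-wiki-news-300d-1M.vec', 'r', encoding='utf8') as f:
    next(f)  # skip the "<token count> <dimension>" header line
    for line in f:
        token, *vec = line.rstrip().split(' ')
        if token in needed:
            emb_dict[token] = [float(x) for x in vec]
            if len(emb_dict) == len(needed):
                break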