Example #1
def clustering(entities, model, ratio, seed, verbose):
    # Cluster embeddings (in the original space)
    n_entities = len(entities)
    if verbose:
        print('Considering ' + str(n_entities) +
              ' entities from triple file...')

    n_clusters = math.ceil(n_entities * ratio / 100)
    if verbose:
        print('Clustering with n_clusters = ' + str(n_clusters))

    clustering_algorithm = KMeans(n_clusters=n_clusters,
                                  n_init=50,
                                  max_iter=500,
                                  random_state=seed)
    clusters = find_clusters(entities,
                             model,
                             clustering_algorithm,
                             mode='entity')

    #pickle_cluster = "./temp/cluster_"+ str(ratio) +".pickle"
    #with open(pickle_cluster, "wb") as fp:   #Pickling
    #    pickle.dump(clusters, fp)

    return clusters
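A minimal driver for the helper above, assuming the AmpliGraph 1.x API (find_clusters from ampligraph.discovery, ComplEx from ampligraph.latent_features) and scikit-learn's KMeans; the toy triples and the ratio/seed values are placeholders, not part of the original snippet:

import math
import numpy as np
from sklearn.cluster import KMeans
from ampligraph.discovery import find_clusters
from ampligraph.latent_features import ComplEx

# Toy knowledge graph of (subject, predicate, object) triples -- illustrative data only.
X = np.array([['TeamArgentina', 'playsIn', 'GroupC'],
              ['TeamBrazil', 'playsIn', 'GroupE'],
              ['TeamGermany', 'playsIn', 'GroupF']])

# Train a small embedding model on the toy triples (hyperparameters are placeholders).
model = ComplEx(k=50, epochs=20, verbose=False)
model.fit(X)

# Cluster the subject entities: ratio=50 asks for roughly one cluster per two entities.
entities = np.unique(X[:, 0])
clusters = clustering(entities, model, ratio=50, seed=42, verbose=True)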
Example #2
def clustering(entities, model, rate, verbose):
    # Cluster embeddings (in the original space)
    n_entities = len(entities)
    if verbose:
        print('Considering ' + str(n_entities) + ' entities from triple file...')
    
    n_clusters = math.ceil(n_entities * rate / 100)
    if verbose:
        print('Clustering with n_clusters = ' + str(n_clusters))

    clustering_algorithm = KMeans(n_clusters=n_clusters,
                                  n_init=10,
                                  max_iter=300,
                                  random_state=0)
    clusters = find_clusters(entities, model, clustering_algorithm, mode='entity')
    return clusters
Example #3
# Get the team entities and their corresponding embeddings
triples_df = pd.DataFrame(X, columns=['s', 'p', 'o'])
teams = triples_df.s[triples_df.s.str.startswith('Team')].unique()
team_embeddings = dict(zip(teams, model.get_embeddings(teams)))
team_embeddings_array = np.array([i for i in team_embeddings.values()])

# Project embeddings into 2D space via PCA
embeddings_2d = PCA(n_components=2).fit_transform(team_embeddings_array)

# Cluster embeddings (in the original space)
clustering_algorithm = KMeans(n_clusters=6,
                              n_init=100,
                              max_iter=500,
                              random_state=0)
clusters = find_clusters(teams, model, clustering_algorithm, mode='entity')


# Map a team entity label (the 'Team' prefix plus a CamelCase country name) to its continent
def cn_to_ctn(country):
    try:
        original_name = ' '.join(re.findall('[A-Z][^A-Z]*', country[4:]))
        return transformations.cn_to_ctn(original_name)
    except KeyError:
        return "unk"


plot_df = pd.DataFrame({
    "teams": teams,
    "embedding1": embeddings_2d[:, 0],
    "embedding2": embeddings_2d[:, 1],
    "continent": pd.Series(teams).apply(cn_to_ctn),
    "cluster": "cluster" + pd.Series(clusters).astype(str)
})
Example #4
def clustering(entities, model):
    # Cluster embeddings (in the original space)
    n_entities = len(entities)
    print('Considering ' + str(n_entities) + ' entities from triple file...')

    # 25%
    n_clusters = math.ceil(n_entities * 0.25)
    print('Clustering with n_clusters = ' + str(n_clusters))
    clustering_algorithm = KMeans(n_clusters=n_clusters,
                                  n_init=10,
                                  max_iter=300,
                                  random_state=0)
    cluster25 = find_clusters(entities,
                              model,
                              clustering_algorithm,
                              mode='entity')
    # Write the 25% cluster assignments to a TSV file
    cluster_df = pd.DataFrame({
        "entities": entities,
        "cluster25": "cluster25" + pd.Series(cluster25).astype(str)
    })
    print(cluster_df['cluster25'].value_counts())
    print(cluster_df['cluster25'].value_counts().value_counts())
    cluster_df.to_csv('./temp/cluster25.tsv',
                      sep='\t',
                      header=False,
                      index=False)

    # 50%
    n_clusters = math.ceil(n_entities * 0.5)
    print('Clustering with n_clusters = ' + str(n_clusters))
    clustering_algorithm = KMeans(n_clusters=n_clusters,
                                  n_init=10,
                                  max_iter=300,
                                  random_state=0)
    cluster50 = find_clusters(entities,
                              model,
                              clustering_algorithm,
                              mode='entity')
    # Write the 50% cluster assignments to a TSV file
    cluster_df = pd.DataFrame({
        "entities": entities,
        "cluster50": "cluster50" + pd.Series(cluster50).astype(str)
    })
    print(cluster_df['cluster50'].value_counts())
    print(cluster_df['cluster50'].value_counts().value_counts())
    cluster_df.to_csv('./temp/cluster50.tsv',
                      sep='\t',
                      header=False,
                      index=False)

    # 75%
    n_clusters = math.ceil(n_entities * 0.75)
    print('Clustering with n_clusters = ' + str(n_clusters))
    clustering_algorithm = KMeans(n_clusters=n_clusters,
                                  n_init=10,
                                  max_iter=300,
                                  random_state=0)
    cluster75 = find_clusters(entities,
                              model,
                              clustering_algorithm,
                              mode='entity')
    # Write the 75% cluster assignments to a TSV file
    cluster_df = pd.DataFrame({
        "entities": entities,
        "cluster75": "cluster75" + pd.Series(cluster75).astype(str)
    })
    print(cluster_df['cluster75'].value_counts())
    print(cluster_df['cluster75'].value_counts().value_counts())
    cluster_df.to_csv('./temp/cluster75.tsv',
                      sep='\t',
                      header=False,
                      index=False)
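The three blocks above repeat the same procedure at 25%, 50%, and 75%. A sketch of the same logic collapsed into a loop, essentially the parameterized helper from Example #1 plus the TSV export; the clustering_to_tsv name is hypothetical, and it assumes the same find_clusters helper, an existing ./temp/ directory, and the hyperparameters used above:

import math
import pandas as pd
from sklearn.cluster import KMeans
from ampligraph.discovery import find_clusters


def clustering_to_tsv(entities, model, ratios=(25, 50, 75), out_dir='./temp'):
    # Run KMeans at several cluster ratios and write one TSV of assignments per ratio.
    n_entities = len(entities)
    print('Considering ' + str(n_entities) + ' entities from triple file...')
    for ratio in ratios:
        n_clusters = math.ceil(n_entities * ratio / 100)
        print('Clustering with n_clusters = ' + str(n_clusters))
        algorithm = KMeans(n_clusters=n_clusters, n_init=10, max_iter=300, random_state=0)
        labels = find_clusters(entities, model, algorithm, mode='entity')
        col = 'cluster' + str(ratio)
        cluster_df = pd.DataFrame({
            'entities': entities,
            col: col + pd.Series(labels).astype(str)
        })
        # Cluster sizes and a histogram of those sizes, as in the blocks above.
        print(cluster_df[col].value_counts())
        print(cluster_df[col].value_counts().value_counts())
        cluster_df.to_csv(out_dir + '/' + col + '.tsv',
                          sep='\t', header=False, index=False)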