def get_clusters():
    """Cluster ``simscore`` for several (k, kernel) pairs and save the labels.

    For every ``k`` in 32..36 and every kernel in a fixed list, ``simscore``
    is passed through ``kPCA``; the transposed components are clustered with
    ``kkMeans`` and the resulting ``clusters`` attribute is written to
    ``<path>/labels/labels_<k>_<kernel>.csv``.

    Reads the module-level names ``simscore`` and ``path``. Returns ``None``.
    """
    kernels = ['linear', 'cosine', 'sigmoid', 'polynomial']
    labels_dir = os.path.join(path, 'labels')
    # exist_ok=True replaces the exists()/makedirs() pair: it is idempotent
    # and cannot fail if the directory appears between the check and the call.
    os.makedirs(labels_dir, exist_ok=True)
    for ii in np.arange(32, 37, 1):
        for ij in kernels:
            pca = kPCA(k=ii, kernel=ij).fit(np.array(simscore))
            components = pca.components_.T
            km = kkMeans(k=ii, kernel=ij, gamma=1).fit_predict(components)
            out_file = os.path.join(labels_dir, f'labels_{ii}_{ij}.csv')
            pd.DataFrame(km.clusters).to_csv(out_file)
Exemple #2
0
def update_figure(make_selection, g_m, knl, drop, yaxis, clust):
    """Build the figure dict (``data`` + ``layout``) for the selected view.

    Args:
        make_selection: Indexable pair ``(first_year, last_year)``; only rows
            whose ``year`` lies in that inclusive range are plotted.
        g_m: View selector. ``'Cluster'`` draws the scatter read from
            ``tsne/tsne_<clust>.csv``; any other value draws the network
            built from ``simscore``.
        knl: Kernel name handed to ``kPCA`` / ``kkMeans`` when points are
            coloured by cluster.
        drop: Years to draw as one trace each. When empty (or ``None``) a
            single trace coloured by cluster label is drawn instead.
        yaxis: ``'Linear'`` selects a linear y axis, anything else ``'log'``.
        clust: Cluster count; also selects which t-SNE CSV file is read.

    Returns:
        dict with ``'data'`` (list of ``go.Scattergl`` traces) and
        ``'layout'`` (a ``go.Layout``).
    """
    if g_m != 'Cluster':
        # The network view does not use the t-SNE file, so it is not read.
        return _network_figure(yaxis)

    # The first CSV column is discarded (presumably the saved index — confirm).
    ts = pd.read_csv(os.path.join(path,
                                  f'tsne/tsne_{int(clust)}.csv')).iloc[:, 1:]
    first_year, last_year = make_selection[0], make_selection[1]

    if drop:
        by_year = ts.sort_values(by=['year'])
        data_places = by_year[(by_year.year >= first_year)
                              & (by_year.year <= last_year)]
        traces = []
        for val in drop:
            # Select the rows of this year once and reuse them for every field.
            subset = data_places[data_places.year == int(val)]
            hover = _hover_tuples(subset)
            traces.append(go.Scattergl(
                x=np.array(subset['0']),
                y=np.array(subset['1']),
                text=hover,
                customdata=hover,
                mode='markers',
                opacity=0.6,
                marker={'size': 15,
                        'line': {'width': 0.5, 'color': 'white'}},
                name=val,
            ))
        return {
            'data': traces,
            'layout': _build_layout(yaxis, 'tsne-2', 'tsne-1'),
        }

    pca = kPCA(k=int(clust), kernel=knl).fit(np.array(simscore))
    km = kkMeans(k=int(clust), kernel=knl,
                 gamma=1).fit_predict(pca.components_.T)
    cluster_labels = km.clusters

    in_range = (ts.year >= first_year) & (ts.year <= last_year)
    # Labels are computed for every row of simscore but only the rows inside
    # the year range are drawn; keep the labels of exactly those rows so each
    # point gets its own colour.
    # NOTE(review): assumes CSV rows are in the same order as simscore rows;
    # the mask is applied only when the lengths agree.
    label_array = np.asarray(cluster_labels)
    if label_array.ndim == 1 and label_array.shape[0] == len(ts):
        cluster_labels = label_array[in_range.to_numpy()]
    ts = ts[in_range]

    hover = _hover_tuples(ts)
    trace = go.Scattergl(
        x=ts.iloc[:, 0].to_numpy(),
        y=ts.iloc[:, 1].to_numpy(),
        text=hover,
        customdata=hover,
        mode='markers',
        opacity=0.7,
        marker={'size': 15,
                'color': cluster_labels,
                'colorscale': 'Viridis',
                'line': {'width': .5, 'color': 'white'}},
    )
    return {
        'data': [trace],
        'layout': _build_layout(yaxis, 'tsne-2', 'tsne-1', height=600),
    }


def _hover_tuples(frame):
    """Return one (pdf stem, year, language, authors, title) tuple per row."""
    stems = frame['pdf_names'].apply(lambda name: name.split('.')[0])
    return list(zip(stems, frame['year'], frame['language'],
                    frame['authors'], frame['title']))


def _build_layout(yaxis, x_title, y_title, height=None):
    """Create the ``go.Layout`` shared by all views; ``height`` is optional."""
    options = {
        'xaxis': {'title': x_title},
        'yaxis': {
            'type': 'linear' if yaxis == 'Linear' else 'log',
            'title': y_title,
        },
        'margin': {'l': 40, 'b': 40, 't': 10, 'r': 10},
        'legend': {'x': 1, 'y': 1},
        'hovermode': 'closest',
    }
    if height is not None:
        options['height'] = height
    return go.Layout(**options)


def _network_figure(yaxis):
    """Draw ``simscore`` as a graph: one node per row, one edge per non-zero entry."""
    ss = np.array(simscore)
    G = nx.Graph()
    # Nodes are added from the row count directly; the previous loop reused
    # the name of the column count as its loop variable, which dropped the
    # last column from the edge scan and the last node from the plot.
    G.add_nodes_from(range(ss.shape[0]))
    rows, cols = np.nonzero(ss)
    G.add_edges_from((i, j)
                     for i, j in zip(rows.tolist(), cols.tolist())
                     if i != j)

    pos = nx.fruchterman_reingold_layout(G)
    nodes = list(G.nodes)
    Xv = [pos[k][0] for k in nodes]
    Yv = [pos[k][1] for k in nodes]

    # Each edge contributes (start, end, None); None breaks the line so the
    # edges are drawn as separate segments within one trace.
    Xed = []
    Yed = []
    for src, dst in G.edges:
        Xed.extend((pos[src][0], pos[dst][0], None))
        Yed.extend((pos[src][1], pos[dst][1], None))

    etrace = go.Scattergl(x=Xed,
                          y=Yed,
                          mode='lines',
                          line=dict(color='rgb(210,210,210)', width=.5),
                          hoverinfo='none')

    vtrace = go.Scattergl(
        x=Xv,
        y=Yv,
        mode='markers',
        name='net',
        marker=dict(symbol='circle-dot',
                    size=5,
                    color='#6959CD',
                    line=dict(color='rgb(50,50,50)', width=0.5)),
        hoverinfo='text')

    return {
        'data': [etrace, vtrace],
        'layout': _build_layout(yaxis, 'year', 'Similarity score',
                                height=600),
    }
Exemple #3
0
        'time': [],
        'acc': [],
        'prec': [],
        'rec': [],
        'f1': [],
        'randind': []
    }
}

for p, q in data_name.items():
    for ii in kernels:
        start = time.time()
        if p == 'moon':
            gamma = 10
            d = 3
            kmeans = kkMeans(k=2, kernel=ii, gamma=gamma).fit_predict(q[0])
            kernel_outcome[ii]['acc'].append(
                kmeans.accuracy(q[1], kmeans.clusters))
            kernel_outcome[ii]['prec'].append(
                kmeans.precision(q[1], kmeans.clusters))
            kernel_outcome[ii]['rec'].append(
                kmeans.recall(q[1], kmeans.clusters))
            kernel_outcome[ii]['f1'].append(kmeans.f1(q[1], kmeans.clusters))
            kernel_outcome[ii]['randind'].append(
                kmeans.rand_index_score(kmeans.clusters, q[1]))
        elif p == 'circle':
            gamma = 10
            d = 3
            kmeans = kkMeans(k=2, kernel=ii, gamma=gamma).fit_predict(q[0])
            kernel_outcome[ii]['acc'].append(
                kmeans.accuracy(q[1], kmeans.clusters))