def get_clusters():
    '''Cluster the similarity scores for several kernels and cluster counts.

    For every k in 32..36 and every kernel in ('linear', 'cosine', 'sigmoid',
    'polynomial'), the module-level ``simscore`` is fitted with ``kPCA``, the
    transposed components are passed to ``kkMeans(...).fit_predict`` (gamma=1)
    and the resulting ``clusters`` attribute is written to
    ``<path>/labels/labels_<k>_<kernel>.csv``.

    Relies on the module-level names ``simscore`` and ``path``.
    Returns nothing; the CSV files are the only output.
    '''
    kernel = ['linear', 'cosine', 'sigmoid', 'polynomial']

    # exist_ok=True replaces the exists()/makedirs() pair, which duplicated the
    # write in both branches and could fail if the directory appeared between
    # the check and the creation.
    os.makedirs(os.path.join(path, 'labels'), exist_ok=True)

    for ii in np.arange(32, 37, 1):
        for ij in kernel:
            pca = kPCA(k=ii, kernel=ij).fit(np.array(simscore))
            components = pca.components_.T
            km = kkMeans(k=ii, kernel=ij, gamma=1).fit_predict(components)
            cluster_labels = km.clusters
            pd.DataFrame(cluster_labels).to_csv(
                os.path.join(path, f'labels/labels_{ii}_{ij}.csv'))
def update_figure(make_selection, g_m, knl, drop, yaxis, clust):
    '''Build the figure dictionary for the t-SNE or similarity-network plot.

    :param: make_selection: indexable year range; rows whose ``year`` is
            outside [make_selection[0], make_selection[1]] are dropped
    :param: g_m: 'Cluster' selects the t-SNE views, any other value selects
            the network view built from ``simscore``
    :param: knl: kernel name forwarded to ``kPCA`` and ``kkMeans``
    :param: drop: years to draw as one trace each; when empty (or None) a
            single trace coloured by cluster label is drawn instead
    :param: yaxis: 'Linear' gives a linear y axis, anything else a log axis
    :param: clust: cluster count; also selects ``tsne/tsne_<clust>.csv``
    :return: dict with the keys 'data' (list of traces) and 'layout'

    Relies on the module-level names ``path`` and ``simscore``.
    '''
    # Read once and keep the file order; the first CSV column is discarded.
    tsne = pd.read_csv(
        os.path.join(path, f'tsne/tsne_{int(clust)}.csv')).iloc[:, 1:]

    if g_m == 'Cluster':
        if drop:
            # One trace per requested year, taken from the year-sorted frame.
            ordered = tsne.sort_values(by=['year'])
            data_places = ordered[(ordered.year >= make_selection[0])
                                  & (ordered.year <= make_selection[1])]
            traces = []
            for val in drop:
                subset = data_places[data_places.year == int(val)]
                hover = _update_figure_hover(subset)
                traces.append(go.Scattergl(
                    x=np.array(subset['0']),
                    y=np.array(subset['1']),
                    text=hover,
                    customdata=hover,
                    mode='markers',
                    opacity=0.6,
                    marker={'size': 15,
                            'line': {'width': 0.5, 'color': 'white'}},
                    name=val,
                ))
            return {
                'data': traces,
                'layout': _update_figure_layout(yaxis, 'tsne-2', 'tsne-1'),
            }

        # No year picked: one trace coloured by kernel k-means cluster label.
        pca = kPCA(k=int(clust), kernel=knl).fit(np.array(simscore))
        km = kkMeans(k=int(clust), kernel=knl,
                     gamma=1).fit_predict(pca.components_.T)
        cluster_labels = km.clusters
        # NOTE(review): the labels are computed for every row of simscore
        # while the points below are year-filtered, so colours presumably only
        # line up when the selection covers all years -- confirm and align.
        selected = tsne[(tsne.year >= make_selection[0])
                        & (tsne.year <= make_selection[1])]
        coords = np.array(selected)
        hover = _update_figure_hover(selected)
        trace = go.Scattergl(
            x=coords[:, 0],
            y=coords[:, 1],
            text=hover,
            customdata=hover,
            mode='markers',
            opacity=0.7,
            marker={'size': 15,
                    'color': cluster_labels,
                    'colorscale': 'Viridis',
                    'line': {'width': .5, 'color': 'white'}},
        )
        return {
            'data': [trace],
            'layout': _update_figure_layout(yaxis, 'tsne-2', 'tsne-1',
                                            height=600),
        }

    # Network view: one node per row of simscore, one edge per non-zero
    # off-diagonal entry.
    ss = np.array(simscore)
    n_rows, n_cols = ss.shape  # also enforces a 2-D matrix
    G = nx.Graph()
    # The original reused ``n`` as the node-loop variable, which shrank the
    # column range to n_rows - 1 and left the last node without a marker.
    G.add_nodes_from(range(n_rows))
    for i, j in np.argwhere(ss != 0):
        if i != j:
            G.add_edge(int(i), int(j))

    pos = nx.fruchterman_reingold_layout(G)
    Xv = [pos[node][0] for node in G.nodes]
    Yv = [pos[node][1] for node in G.nodes]
    Xed = []
    Yed = []
    for src, dst in G.edges:
        # The trailing None breaks the line so edges are not chained together.
        Xed += [pos[src][0], pos[dst][0], None]
        Yed += [pos[src][1], pos[dst][1], None]

    etrace = go.Scattergl(x=Xed,
                          y=Yed,
                          mode='lines',
                          line=dict(color='rgb(210,210,210)', width=.5),
                          hoverinfo='none')
    vtrace = go.Scattergl(x=Xv,
                          y=Yv,
                          mode='markers',
                          name='net',
                          marker=dict(symbol='circle-dot',
                                      size=5,
                                      color='#6959CD',
                                      line=dict(color='rgb(50,50,50)',
                                                width=0.5)),
                          hoverinfo='text')
    return {
        'data': [etrace, vtrace],
        'layout': _update_figure_layout(yaxis, 'year', 'Similarity score',
                                        height=600),
    }


def _update_figure_hover(frame):
    '''Return (pdf stem, year, language, authors, title) tuples, one per row.'''
    return list(zip(frame['pdf_names'].apply(lambda x: x.split('.')[0]),
                    frame['year'],
                    frame['language'],
                    frame['authors'],
                    frame['title']))


def _update_figure_layout(yaxis, x_title, y_title, height=None):
    '''Build the layout shared by every view; height is set only if given.'''
    layout_kwargs = {
        'xaxis': {'title': x_title},
        'yaxis': {
            'type': 'linear' if yaxis == 'Linear' else 'log',
            'title': y_title,
        },
        'margin': {'l': 40, 'b': 40, 't': 10, 'r': 10},
        'legend': {'x': 1, 'y': 1},
        'hovermode': 'closest',
    }
    if height is not None:
        layout_kwargs['height'] = height
    return go.Layout(**layout_kwargs)
'variance': [] }, 'etakernel': { 'time': [], 'variance': [] }, 'laplace': { 'time': [], 'variance': [] } } for p, q in data_name.items(): for ii in kernels: start = time.time() kpca = kPCA(k=2, kernel=ii).fit(q) end = time.time() - start kernel_outcome[ii][f'{p}'] = kpca.fit_transform() kernel_outcome[ii]['time'].append(end) kernel_outcome[ii]['variance'].append(kpca.explained_variance) #pca = kPCA(k = 2, kernel = kernels[0]).fit(dfclass) #newX = pca.fit_transform() #plt.scatter(newX[:, 0], newX[:, 1], c = yclass, s = 3) #%% Visualize dataset fig, ax = plt.subplots(5, 1, figsize=(2, 7), gridspec_kw=dict(hspace=0.01, wspace=0.01),
def fit_predict(self, ds_x=None, ds_y=None, dt_x=None,
                dt_y=None, d=None, type=None, m_kernel=None):
    '''Domain Adaptation using Subspace Alignment

    Scales both domains with MinMaxScaler, fits a PCA/kPCA subspace on each,
    aligns the source subspace with the target one, projects both data sets
    and scores a 1-nearest-neighbour classifier trained on the projected
    source data against the target labels.

    :param: ds_x: source data, NxD
    :param: ds_y: source labels (flattened with ``ravel``)
    :param: dt_x: target data, NxD
    :param: dt_y: target labels (flattened with ``ravel``)
    :param: d: Number of principal components (defaults to 2)
    :param: type: falsy -> ``kPCA`` fitted on the transposed scaled data;
            truthy -> ``PCA`` fitted on the scaled data
    :param: m_kernel: kernel name for ``kPCA`` (defaults to 'linear')
    :return: self, with ``X_t``, ``X_a``, ``S_a``, ``T_a``, ``classifier``,
             ``ypred`` and ``accuracy`` set
    :raises IOError: when any of the four data/label arguments is None
    '''
    if ds_x is None:
        raise IOError('Source Input data in required')
    self.ds_x = ds_x
    if ds_y is None:
        raise IOError('Source Input labels in required')
    self.ds_y = ds_y.ravel()
    if dt_x is None:
        raise IOError('Target Input data in required')
    self.dt_x = dt_x
    if dt_y is None:
        raise IOError('Target Input labels in required')
    self.dt_y = dt_y.ravel()

    self.d = 2 if d is None else d
    self.m_kernel = m_kernel if m_kernel else 'linear'

    # Silence the MinMaxScaler conversion warning only for the duration of
    # this call; catch_warnings restores the process-wide filters on exit
    # instead of leaving them permanently modified.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DataConversionWarning)

        # subspace of the source domain, fitted on the scaled data
        X_w = MinMaxScaler().fit_transform(self.ds_x).astype(float)
        if not type:
            X_s = kPCA(k=self.d, kernel=self.m_kernel).fit(X_w.T)
        else:
            X_s = PCA(k=self.d).fit(X_w)
        X_s = X_s.components_.T

        # subspace of the target domain, fitted on the scaled data
        X_d = MinMaxScaler().fit_transform(self.dt_x).astype(float)
        if not type:
            X_t = kPCA(k=self.d, kernel=self.m_kernel).fit(X_d.T)
        else:
            X_t = PCA(k=self.d).fit(X_d)
        self.X_t = X_t.components_.T

        # align the source basis with the target basis, then project
        # NOTE(review): the projections use the unscaled ds_x / dt_x although
        # the subspaces were fitted on the scaled copies -- confirm intended.
        self.X_a = X_s.dot(X_s.T.dot(self.X_t))
        self.S_a = self.ds_x.dot(self.X_a)  # source projection
        self.T_a = self.dt_x.dot(self.X_t)  # target projection
        print('>>>> Done with Subspace alingment and Data projection >>>>')

        # Fit a 1-NN classifier on S_a and make predictions on T_a
        print('*' * 40)
        print('Initializing 1-Nearest Neighbour classifier')
        self.classifier = KNeighborsClassifier(n_neighbors=1)
        self.classifier.fit(self.S_a, self.ds_y)
        print('>>>> Done fitting source domain >>>>')
        self.ypred = self.classifier.predict(self.T_a)

    self.accuracy = EvalC.accuary_multiclass(self.dt_y, self.ypred)
    print(f'Accuracy: {self.accuracy}')
    return self