def __init__(self, datos, n_componentes = 5):
    """Fit a PCA on ``datos`` and cache the tables derived from it.

    Args:
        datos: Data set passed to ``PCA.fit`` and to every accessor below.
        n_componentes: Value forwarded as ``n_components`` to ``PCA``.
    """
    modelo = PCA(n_components = n_componentes).fit(datos)
    self.__datos = datos
    self.__modelo = modelo
    # Variable-level and row-level tables, all evaluated on the same data
    # the model was fitted on.
    self.__correlacion_var = modelo.column_correlations(datos)
    self.__coordenadas_ind = modelo.row_coordinates(datos)
    self.__contribucion_ind = modelo.row_contributions(datos)
    self.__cos2_ind = modelo.row_cosine_similarities(datos)
    # Each explained_inertia_ entry multiplied by 100.
    self.__var_explicada = [inercia * 100 for inercia in modelo.explained_inertia_]
def get_pca(self, components: int = 3):
    """Run a PCA over ``self.df`` and collect its main outputs.

    Args:
        components: Value forwarded as ``n_components`` to ``PCA``.

    Returns:
        dict with three entries:
            'fit': the object returned by ``PCA.fit``.
            'rotated': ``self.df`` transformed by that fitted object.
            'feature_correlations': its ``column_correlations`` for ``self.df``.
    """
    data = self.df
    pca = PCA(n_components=components, n_iter=100, rescale_with_mean=True,
              rescale_with_std=True, copy=True, check_input=True)

    # Fit exactly once and reuse the fitted object for every output. The
    # previous code referenced an undefined name ``fit`` (NameError) and
    # re-fitted the model through ``fit_transform``.
    fitted = pca.fit(data)
    results = {
        'fit': fitted,
        'rotated': fitted.transform(data),
        'feature_correlations': fitted.column_correlations(data),
    }
    return results
class DFPCA(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around ``PCA``.

    NOTE:
      - ``DFPCA(n_components=df.shape[1])`` keeps every dimension.
      - ``DFPCA(rescale_with_mean=False, rescale_with_std=False)`` avoids
        using the built-in StandardScaler().
    """

    def __init__(self, columns=None, prefix='pca_', **kwargs):
        # ``kwargs`` are forwarded untouched to the PCA constructor.
        self.columns, self.prefix = columns, prefix
        self.model = PCA(**kwargs)
        # Both are filled in by fit().
        self.transform_cols = self.stat_df = None

    def fit(self, X, y=None):
        """Fit the wrapped model on the selected columns of ``X``.

        When ``columns`` was left as None, every column of ``X`` is used and
        ``self.columns`` is overwritten with ``X.columns``. ``y`` is ignored.
        Returns ``self``.
        """
        if self.columns is None:
            self.columns = X.columns
        # Keep X's own column order; ignore requested names X does not have.
        self.transform_cols = [col for col in X.columns if col in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://www.appliedaicourse.com/lecture/11/applied-machine-learning-online-course/2896/pca-for-dimensionality-reduction-not-visualization/0/free-videos
        n_dims = len(self.model.eigenvalues_)
        self.stat_df = pd.DataFrame({
            'dimension': list(range(1, n_dims + 1)),
            'eigenvalues': self.model.eigenvalues_,
            'explained_inertia': self.model.explained_inertia_,
            'cumsum_explained_inertia': np.cumsum(self.model.explained_inertia_)
        })
        return self

    def transform(self, X):
        """Replace the fitted columns of ``X`` with prefixed component columns.

        Raises:
            NotFittedError: if ``fit`` has not been called yet.
        """
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        projected = self.model.transform(X[self.transform_cols])
        prefixed = {col: f'{self.prefix}{col}' for col in projected.columns}
        projected.rename(columns=prefixed, inplace=True)

        # Untouched columns first, then the new component columns.
        untouched = X.drop(columns=self.transform_cols)
        return pd.concat([untouched, projected], axis=1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed ``X``; ``y`` is ignored."""
        fitted = self.fit(X)
        return fitted.transform(X)
def setup_class(cls):
    """Load the decathlon data and build the fixtures stored on ``cls``.

    Sets ``df_categorical``, ``df_numeric``, ``n``, ``p``, ``center_reduced``,
    ``cov``, ``n_components`` and ``pca`` as class attributes.
    """
    raw = pd.read_csv('tests/data/decathlon.csv', index_col=0)

    # Split the columns: non-numeric ones on one side, the rest on the other.
    cls.df_categorical = raw.select_dtypes(exclude=[np.number])
    cls.df_numeric = raw.drop(columns=cls.df_categorical.columns)

    # Row and column counts of the numeric part.
    cls.n, cls.p = cls.df_numeric.shape

    # Centre and reduce each numeric column, then form the cross-product of
    # the resulting matrix with itself (no division by a sample count here).
    numeric = cls.df_numeric.copy()
    standardized = (numeric - numeric.mean()) / numeric.std()
    cls.center_reduced = standardized.values
    cls.cov = cls.center_reduced.T @ cls.center_reduced

    # Full PCA: one component per numeric column.
    cls.n_components = len(cls.df_numeric.columns)
    cls.pca = PCA(raw, n_components=cls.n_components, scaled=True)
def model_interpretation(self, patient_id, patient_preprocessed, pred, prob, model): ''' Fazer gráficos avaliativos do modelo. Argumentos: patient_id = string referente a identificação do paciente patient_preprocessed = dicionario contendo dados do exame do paciente pred = classe predita pelo modelo prob = probabilidade referente a classe predita pelo modelo model = objeto do modelo ''' #### Pegar variaveis necessárias para o plot (import csv) #### Nome dos plots plot_1_name = 'app/ai_models/temp/probacurve-' + str( patient_id) + '.png' plot_2_name = 'app/ai_models/temp/shap-' + str(patient_id) + '.png' plot_3_name = 'app/ai_models/temp/dist-' + str(patient_id) + '.png' plot_4_name = 'app/ai_models/temp/mapa-' + str(patient_id) + '.png' #URL API PLOTS plot_1_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/probacurve-" + str( patient_id) + ".png" plot_2_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/shap-" + str( patient_id) + ".png" plot_3_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/dist-" + str( patient_id) + ".png" plot_4_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/mapa-" + str( patient_id) + ".png" #### Configurações gerais do plt DPI_IMAGES = 100 FONT_SIZE = 8 FONT_NAME = 'sans-serif' plt.rc('font', family=FONT_NAME, size=FONT_SIZE) plt.rc('axes', titlesize=FONT_SIZE, labelsize=FONT_SIZE) plt.rc('xtick', labelsize=FONT_SIZE) plt.rc('ytick', labelsize=FONT_SIZE) plt.rc('legend', fontsize=FONT_SIZE) #### PLOT 1 - Distribuição da probabilidade dada pelo modelo para pacientes positivos # Itens Necessário: self.probs_df(csv importado) e pred exame_resp = pred exame_prob = prob # Plot fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(5, 5)) sns.kdeplot(self.probs_df['prob_neg'], shade=True, color='#386796', ax=axis, linestyle="--", label='Casos Negativos') sns.kdeplot(self.probs_df['prob_pos'], shade=True, color='#F06C61', ax=axis, label='Casos positivos') # Pegar eixo XY do Plt object para fazer a 
interpolação if exame_resp == 0: xi = 1 - exame_prob data_x, data_y = axis.lines[0].get_data() elif exame_resp == 1: xi = exame_prob data_x, data_y = axis.lines[1].get_data() # Fazer a interpolação e plot yi = np.interp(xi, data_x, data_y) axis.plot([xi], [yi], linestyle='None', marker="*", color='black', markersize=10, label='Paciente') # Outras configuracoes do plot axis.legend(loc="upper right") #axis.set_title('Probabilidade de ser COVID Positivo pelo modelo', fontweight='bold') axis.set_xlim([0, 1]) axis.set_ylim([0, axis.get_ylim()[1]]) plt.tight_layout() # Salvar plot 1 plt.savefig(plot_1_name, dpi=DPI_IMAGES, bbox_inches='tight', pad_inches=0.1) plt.close() #### PLOT 2 - SHAP # Necessário: patient_preprocessed, pred e model features = np.array(list(patient_preprocessed.keys())) sample_x = np.array(list(patient_preprocessed.values())) # Calcular SHAP Value explainer = TreeExplainer(model=model) # Faz o objeto SHAP shap_values_sample = explainer.shap_values(sample_x) # Calculo do SHAP expected_value = explainer.expected_value[ exame_resp] # Pega o baseline para a classe predita pelo modelo shap_values_sample = explainer.shap_values( sample_x) # Calcular os SHAP values # Plot #plt.title('Valores SHAP', fontweight='bold') waterfall_plot(expected_value, shap_values_sample[exame_resp], sample_x, feature_names=features, max_display=20, show=False) # Salvar imagem plt.tight_layout() plt.savefig(plot_2_name, dpi=DPI_IMAGES, bbox_inches='tight', pad_inches=0) plt.close() #### PLOT 3 - Distribuição das variáveis mais importantes para o modelo # Necessário: self.train_df(csv importado), patient_preprocessed, pred important_features = [ 'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos' ] target_0 = self.train_df[self.train_df['target'] == 0][[ 'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos' ]] target_1 = self.train_df[self.train_df['target'] == 1][[ 'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos' ]] # Plot fig, axes = plt.subplots(nrows=2, ncols=2, 
figsize=(10, 5)) # Plot settings #sns.set_color_codes() #st = fig.suptitle("Distribuição das variáveis importantes para o modelo", fontweight='bold') #st.set_y (1.05) # Index col/row r = 0 c = 0 # Loop to plot for feat in important_features: # Plot distribuição sns.kdeplot(list(target_0[feat]), shade=True, color='#386796', ax=axes[r][c], label='Casos Negativos', linestyle="--") sns.kdeplot(list(target_1[feat]), shade=True, color='#F06C61', ax=axes[r][c], label='Casos positivos') # Pegar a curva de densidade a partir do resultado do modelo if pred == 0: data_x, data_y = axes[r][c].lines[0].get_data() elif pred == 1: data_x, data_y = axes[r][c].lines[1].get_data() # Pegar a informação (valor) daquela variável importante xi = patient_preprocessed[feat] yi = np.interp(xi, data_x, data_y) ## Plot ponto na curva axes[r][c].plot([xi], [yi], linestyle='None', marker="*", color='black', markersize=10, label='Paciente') axes[r][c].set_title(feat) axes[r][c].legend(loc="upper right") axes[r][c].set_ylim([0, axes[r][c].get_ylim()[1]]) # Mudar onde sera plotado if c == 0: c += 1 else: r += 1 c = 0 # Ajeitar o plot plt.tight_layout() # Salvar imagem plt.savefig(plot_3_name, dpi=DPI_IMAGES, bbox_inches='tight', pad_inches=0.1) plt.close() #### PLOT 4 - Mapa com SVD para os pacientes # Necessário: train_df(csv importado), patient_preprocessed amostra = pd.DataFrame(patient_preprocessed, index=[ 0, ]).drop(axis=1, columns=['Outra gripe']) # Fazer PCA com SVD via prince package y_train = self.train_df['target'] # Salvar coluna target dados = self.train_df.drop( axis=1, columns=['Outra gripe', 'target']).copy() # Dataset para criar o mapa pca_obj = PCA(n_components=2, random_state=42) # Objeto do PCA pca_obj.fit(dados) # Fit no conjunto de dados componentes = pca_obj.transform( dados) # Criar os componentes principais dos dados transf = pca_obj.transform(amostra) # Transformar paciente para PCA xi = transf.loc[0, 0] # Eixo X do paciente para plot yi = transf.loc[0, 1] # Eixo Y do 
paciente para plot comp = pd.DataFrame() # Dataframe para conter os componentes comp['C1'] = componentes[0] # Componente Principal 1 comp['C2'] = componentes[1] # Componente Principal 2 comp['TG'] = y_train # Variável target para a mascara comp_0 = comp[comp['TG'] == 0][['C1', 'C2' ]] # Dataframe de CP para negativos comp_1 = comp[comp['TG'] == 1][['C1', 'C2' ]] # Dataframe de CP para positivos # Plot fig, ax = plt.subplots(figsize=(8, 8)) plt.margins(0, 0) sns.scatterplot(ax=ax, data=comp_0, x='C1', y='C2', color='#386796', label='Casos Negativos') sns.scatterplot(ax=ax, data=comp_1, x='C1', y='C2', color='#F06C61', label='Casos Positivos') x_mean, y_mean, width, height, angle = self.build_ellipse( comp_0['C1'], comp_0['C2']) ax.add_patch( Ellipse((x_mean, y_mean), width, height, angle=angle, linewidth=2, color='#386796', fill=True, alpha=0.2)) x_mean, y_mean, width, height, angle = self.build_ellipse( comp_1['C1'], comp_1['C2']) ax.add_patch( Ellipse((x_mean, y_mean), width, height, angle=angle, linewidth=2, color='#F06C61', fill=True, alpha=0.2)) ax.plot([xi], [yi], linestyle='None', marker="*", color='black', markersize=10, label='Paciente') # Configurações do plot #ax.set_title('Similaridade entre pacientes',fontweight='bold') ax.set_xticks([]) ax.set_yticks([]) ax.set_ylabel('') ax.set_xlabel('') handles, labels = ax.get_legend_handles_labels() labels, handles = zip( *sorted(zip(labels, handles), key=lambda t: t[0])) ax.legend(handles, labels, loc="upper right") # Salvar imagem plt.axis('off') plt.savefig(plot_4_name, dpi=DPI_IMAGES, bbox_inches='tight', pad_inches=0) plt.close() # Retornar model_result = { 'prediction': pred, 'probability': str(round(prob * 100, 2)), 'probacurve': plot_1_api, 'shap_img': plot_2_api, 'dist_img': plot_3_api, 'mapa_img': plot_4_api } return model_result """
def pca(load_df, k):
    """Build the PCA for ``load_df`` with ``k`` components and ``scaled=True``."""
    executed = PCA(load_df, n_components=k, scaled=True)
    return executed
class ACP:
    """PCA fitted on a data set, with cached result tables and three plots."""

    def __init__(self, datos, n_componentes = 5):
        """Fit a PCA on ``datos`` and cache the tables derived from it.

        Args:
            datos: Data set passed to ``PCA.fit`` and to every accessor below.
            n_componentes: Value forwarded as ``n_components`` to ``PCA``.
        """
        modelo = PCA(n_components = n_componentes).fit(datos)
        self.__datos = datos
        self.__modelo = modelo
        self.__correlacion_var = modelo.column_correlations(datos)
        self.__coordenadas_ind = modelo.row_coordinates(datos)
        self.__contribucion_ind = modelo.row_contributions(datos)
        self.__cos2_ind = modelo.row_cosine_similarities(datos)
        # Each explained_inertia_ entry multiplied by 100 (shown with '%'
        # in the axis labels of the plots).
        self.__var_explicada = [inercia * 100 for inercia in modelo.explained_inertia_]

    @property
    def datos(self):
        return self.__datos

    @datos.setter
    def datos(self, datos):
        # Only replaces the stored data; nothing is re-fitted here.
        self.__datos = datos

    @property
    def modelo(self):
        return self.__modelo

    @property
    def correlacion_var(self):
        return self.__correlacion_var

    @property
    def coordenadas_ind(self):
        return self.__coordenadas_ind

    @property
    def contribucion_ind(self):
        return self.__contribucion_ind

    @property
    def cos2_ind(self):
        return self.__cos2_ind

    @property
    def var_explicada(self):
        return self.__var_explicada

    @var_explicada.setter
    def var_explicada(self, var_explicada):
        self.__var_explicada = var_explicada

    def _etiqueta_eje(self, eje):
        """Return the axis label text for component ``eje``."""
        inercia = round(self.var_explicada[eje], 2)
        return f'Componente {eje!s} ({inercia!s}%)'

    def plot_plano_principal(self, ejes = [0, 1], ind_labels = True, titulo = 'Plano Principal'):
        """Scatter the row coordinates on the two components in ``ejes``.

        Args:
            ejes: Pair of component labels used as X and Y.
            ind_labels: When true, annotate each point with its index label.
            titulo: Plot title.
        """
        coordenadas = self.coordenadas_ind
        x = coordenadas[ejes[0]].values
        y = coordenadas[ejes[1]].values

        plt.subplots(figsize=(13, 13))
        plt.style.use('seaborn-whitegrid')
        plt.scatter(x, y, color = 'gray')
        plt.title(titulo)
        plt.axhline(y = 0, color = 'dimgrey', linestyle = '--')
        plt.axvline(x = 0, color = 'dimgrey', linestyle = '--')

        etiqueta_x = self._etiqueta_eje(ejes[0])
        etiqueta_y = self._etiqueta_eje(ejes[1])
        plt.xlabel(etiqueta_x)
        plt.ylabel(etiqueta_y)

        if not ind_labels:
            return
        for txt, px, py in zip(self.coordenadas_ind.index, x, y):
            plt.annotate(txt, (px, py))

    def plot_circulo(self, ejes = [0, 1], var_labels = True, titulo = 'Círculo de Correlación'):
        """Draw the unit circle with one arrow per variable correlation.

        Args:
            ejes: Pair of column positions of ``correlacion_var`` to plot.
            var_labels: When true, write each variable name past its arrow.
            titulo: Plot title.
        """
        cor = self.correlacion_var.iloc[:, ejes].values
        plt.style.use('seaborn-whitegrid')
        circulo = plt.Circle((0, 0), radius = 1, color = 'steelblue', fill = False)
        plt.subplots(figsize=(13, 13))
        plt.gca().add_patch(circulo)
        plt.axis('scaled')
        plt.title(titulo)
        plt.axhline(y = 0, color = 'dimgrey', linestyle = '--')
        plt.axvline(x = 0, color = 'dimgrey', linestyle = '--')

        etiqueta_x = self._etiqueta_eje(ejes[0])
        etiqueta_y = self._etiqueta_eje(ejes[1])
        plt.xlabel(etiqueta_x)
        plt.ylabel(etiqueta_y)

        for i, fila in enumerate(cor):
            # Arrow stops slightly short of the point; label sits slightly past it.
            plt.arrow(0, 0, fila[0] * 0.95, fila[1] * 0.95, color = 'steelblue',
                      alpha = 0.5, head_width = 0.05, head_length = 0.05)
            if var_labels:
                plt.text(fila[0] * 1.05, fila[1] * 1.05, self.correlacion_var.index[i],
                         color = 'steelblue', ha = 'center', va = 'center')

    def plot_sobreposicion(self, ejes = [0, 1], ind_labels = True,
                      var_labels = True, titulo = 'Sobreposición Plano-Círculo'):
        """Overlay the row scatter and the scaled variable arrows.

        Args:
            ejes: Pair of component labels / column positions to plot.
            ind_labels: When true, annotate each point with its index label.
            var_labels: When true, write each variable name past its arrow.
            titulo: Accepted for symmetry with the other plots; this method
                does not call ``plt.title``.
        """
        x = self.coordenadas_ind[ejes[0]].values
        y = self.coordenadas_ind[ejes[1]].values

        cor_df = self.correlacion_var.iloc[:, ejes]
        cor_x = cor_df[ejes[0]]
        cor_y = cor_df[ejes[1]]
        # NOTE(review): division binds tighter than subtraction, so each term
        # is max - (min / correlation range), not a ratio of ranges. Kept
        # exactly as written; confirm whether that is intended.
        alcance_x = max(x) - (min(x) / (max(cor_x) - min(cor_x)))
        alcance_y = max(y) - (min(y) / (max(cor_y) - min(cor_y)))
        scale = min(alcance_x, alcance_y) * 0.7

        cor = cor_df.values
        plt.subplots(figsize=(13, 13))
        plt.style.use('seaborn-whitegrid')
        plt.axhline(y = 0, color = 'dimgrey', linestyle = '--')
        plt.axvline(x = 0, color = 'dimgrey', linestyle = '--')

        etiqueta_x = self._etiqueta_eje(ejes[0])
        etiqueta_y = self._etiqueta_eje(ejes[1])
        plt.xlabel(etiqueta_x)
        plt.ylabel(etiqueta_y)

        plt.scatter(x, y, color = 'gray')
        if ind_labels:
            for txt, px, py in zip(self.coordenadas_ind.index, x, y):
                plt.annotate(txt, (px, py))

        for i, fila in enumerate(cor):
            plt.arrow(0, 0, fila[0] * scale, fila[1] * scale, color = 'steelblue',
                      alpha = 0.5, head_width = 0.05, head_length = 0.05)
            if var_labels:
                plt.text(fila[0] * scale * 1.15, fila[1] * scale * 1.15,
                         self.correlacion_var.index[i],
                         color = 'steelblue', ha = 'center', va = 'center')
def run():
    """Compare drafters by their choices, then cluster and plot them.

    Steps, all driven by the command-line arguments:
      1. Read the choice CSV files (``args.files``) and merge them.
      2. Compute the pairwise share of equal choices and save it as
         ``similarities.pkl`` / ``similarities.csv`` under ``args.path``.
      3. Map every choice to a point on the unit circle, run PCA down to
         ``args.dimensions`` components, and cluster the result with K-Means,
         picking k by the best silhouette score.
      4. Plot the coordinates coloured by cluster, save the figure under
         ``args.path`` and show it.
    """
    # read command line arguments
    args = get_arg_parser().parse_args()

    # create output folder if it doesn't exist
    os.makedirs(args.path, exist_ok=True)

    print("Reading data...")

    # read all csv choices files
    dfs = [pd.read_csv(file, header=[0, 1]) for file in args.files]

    if len(dfs) > 1:
        # concat their columns, then drop the duplicated ones
        choices = pd.concat(dfs, axis=1, join='inner')
        choices = choices.loc[:, ~choices.columns.duplicated()]
    else:
        choices = dfs[0]

    # remove columns that will not be used
    choices.drop(columns=['drafter', 'entropy'], level=0, inplace=True)

    # get list of drafters from remaining columns
    drafters = list(choices.columns.get_level_values(0).unique())

    print("Processing data...")

    # stack 1st and 2nd players' choices of each drafter into one column
    temp = pd.DataFrame(index=range(len(choices.index) * 2), columns=drafters)
    for drafter in drafters:
        drafter_columns = [choices[(drafter, '1st')], choices[(drafter, '2nd')]]
        temp[drafter] = pd.concat(drafter_columns, ignore_index=True)

    # discard original dataframe in favor of the new one
    choices = temp

    print("Calculating similarities...")

    similarities = pd.DataFrame(index=drafters, columns=drafters)
    total_rows = len(choices.index)

    for i, drafter1 in enumerate(drafters):
        # the similarity of a drafter and itself is of 100%
        # (.loc instead of chained [][] assignment, which can silently
        # write to a temporary copy)
        similarities.loc[drafter1, drafter1] = 1.0

        # the measure is symmetric, so each pair is computed only once
        for drafter2 in drafters[i + 1:]:
            equal_rows = (choices[drafter1] == choices[drafter2]).sum()
            similarity = equal_rows / total_rows
            similarities.loc[drafter1, drafter2] = similarity
            similarities.loc[drafter2, drafter1] = similarity

    # save similarities dataframe to files
    similarities.to_pickle(os.path.join(args.path, 'similarities.pkl'))
    similarities.to_csv(os.path.join(args.path, 'similarities.csv'))

    print("Applying PCA...")

    # map the three choices to equidistant points on a circumference:
    # 30, 150 and 270 degrees. Keys 0-2 hold one coordinate (sine) and keys
    # 3-5 the other (cosine). math.sin/math.cos take radians, so the angles
    # are converted; the old code passed 30/120/210 straight in.
    choices_to_points = {}
    for choice in range(3):
        theta = math.radians(30 + 120 * choice)
        choices_to_points[choice] = math.sin(theta)
        choices_to_points[choice + 3] = math.cos(theta)

    # double the amount of rows to store the points' x and y
    choices = pd.concat([choices, choices + 3])

    # map choices to points
    choices = choices.applymap(choices_to_points.__getitem__)

    # apply PCA down to 2 or 3 dimensions
    pca = PCA(n_components=args.dimensions, random_state=824)
    coords = pca.fit_transform(choices.T)
    coords.columns = ['x', 'y', 'z'] if args.dimensions == 3 else ['x', 'y']

    print("Applying k-means...")

    # apply K-Means to the PCA coordinates, finding the optimal value of k
    # with the average silhouette method
    silhouettes, clusterings = [], []

    for k in range(2, len(drafters)):
        print(f"Trying k={k}", end="")

        kmeans = KMeans(n_clusters=k, random_state=824).fit(coords)
        silhouette = silhouette_score(coords, kmeans.labels_, random_state=824)

        silhouettes.append(silhouette)
        clusterings.append(kmeans.labels_)

        print(f", silhouette={silhouette}, labels={kmeans.labels_}")

    if silhouettes:
        best_k = np.argmax(silhouettes) + 2
        labels = clusterings[best_k - 2]
    else:
        # fewer than three drafters: no candidate k was tried, so put
        # everyone in a single cluster instead of failing on an empty argmax
        best_k = 1
        labels = np.zeros(len(drafters), dtype=int)

    print(f"Best k: {best_k}. Labels: {labels}")

    # color the drafters according to their cluster
    all_colors = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'
    ]

    # wrap around the palette so more than ten clusters cannot overflow it
    coords['color'] = [all_colors[cluster_id % len(all_colors)]
                       for cluster_id in labels]

    print("All done.")

    # rename agents: ".../<battler>/<drafter>/<x>" becomes "<drafter>/<code>"
    battler_codes = {'max-attack': 'MA', 'greedy': 'GR'}
    for i, full_name in enumerate(drafters):
        tokens = full_name.split('/')

        # tokens[-3] needs at least three path components
        if len(tokens) >= 3:
            battler = battler_codes[tokens[-3]]
            drafter = tokens[-2]
            drafters[i] = f"{drafter}/{battler}"

    print("columns")
    print(coords.columns)

    # normalize the axes (every column except the trailing 'color')
    for axis in coords.columns[:-1]:
        coords[axis] -= coords[axis].min()
        coords[axis] /= coords[axis].max()

    print(coords)

    if args.dimensions == 3:
        # plot the PCA coordinates in 3D
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        objs = [ax.scatter(x, y, z, marker='o', c=color, label=name)
                for name, x, y, z, color in coords.itertuples()]

        plt.legend(objs, drafters, ncol=3, fontsize=8, loc='upper left')

        ax.set_xlabel('X Label')
        ax.set_ylabel('Y Label')
        ax.set_zlabel('Z Label')

    else:
        # plot the PCA coordinates in 2D
        plt.subplots_adjust(bottom=0.1)

        for name, x, y, color in coords.itertuples():
            plt.scatter(x, y, marker='o', label=name, c=color)

        for label, x, y in zip(drafters, coords['x'], coords['y']):
            plt.annotate(label, xy=(x, y), xytext=(-20, 20),
                         textcoords='offset points', ha='right', va='bottom',
                         bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                         arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

    suffix = "3D" if args.dimensions == 3 else ""
    plt.savefig(os.path.join(args.path, f'similarities{suffix}.png'))
    plt.show()

    print("✅")
def __init__(self, columns=None, prefix='pca_', **kwargs):
    """Store the column selection and prefix, and build the wrapped PCA.

    Args:
        columns: Stored as ``self.columns``; None by default.
        prefix: Stored as ``self.prefix``.
        **kwargs: Forwarded untouched to the ``PCA`` constructor.
    """
    self.columns, self.prefix = columns, prefix
    self.model = PCA(**kwargs)
    # Both start empty.
    self.transform_cols = self.stat_df = None