def PlotCodeFrequency(codes, labels, save=False, path='', filename='img'):
    """
    Plot, as a stacked bar chart, how often each code occurs in the dataset,
    split by label (DI = 0, DP = 1). Frequencies are percentages of the total
    number of samples.

    - codes (pandas Series, list, numpy array): codes of each sample.
    - labels (pandas Series, list, numpy array): labels of each sample.
    - save (bool): tells if the plot should be saved.
    - path (string): path where to save the figure.
    - filename (string): name of the figure image file to be saved.

    Returns nothing; the figure is shown with plt.show() and, when `save` is
    true, handed to util.SaveFigure.
    """
    DI = 0
    DP = 1

    df = pd.DataFrame()
    df['labels'] = labels
    df['codes'] = codes
    total = len(df)

    existingCodes = sorted(df['codes'].unique())

    def percentages(label):
        # Share (in % of all samples) of each code restricted to one label,
        # in the same order as existingCodes.
        return [
            (((df['codes'] == c) & (df['labels'] == label)).sum() * 100.0) / total
            for c in existingCodes
        ]

    freq_di = percentages(DI)
    freq_dp = percentages(DP)

    ind = np.arange(len(existingCodes))
    width = 0.5

    # Work on one explicit Axes. The previous code called plt.axes() to reach
    # the y axis, which adds a *new* Axes on top of the bars in current
    # matplotlib versions instead of returning the existing one.
    fig = plt.figure(figsize=(11, 5))
    ax = fig.add_subplot(1, 1, 1)
    dp_bar = ax.bar(ind, freq_dp, width)
    di_bar = ax.bar(ind, freq_di, width, bottom=freq_dp)  # DI stacked on DP

    ax.set_ylabel('Porcentagem (%)')
    ax.set_xlabel('Códigos')
    ax.set_title('Frequência de cada código no dataset')
    ax.set_xticks(ind)
    ax.set_xticklabels(existingCodes)
    ax.set_yticks(np.arange(0, 25, 5))
    ax.yaxis.set_minor_locator(MultipleLocator(1))
    ax.legend((di_bar[0], dp_bar[0]), ('DI', 'DP'))
    ax.grid(True, which='both', axis='y')
    plt.show()

    if save:
        util.CheckAndCreatePath(path)
        util.SaveFigure(fig, path, filename)
def PlotAccuracyOfEachEventCode(codes, labels, prediction, save=False, path='', filename='img'):
    """
    Draw a bar chart with the model accuracy (in %) for every event code.

    - codes (pandas Series, list, numpy array): codes of each sample.
    - labels (pandas Series, list, numpy array): labels of each sample.
    - prediction (pandas Series, list, numpy array): predictions of each sample.
    - save (bool): tells if the plot should be saved.
    - path (string): path where to save the figure. e.g.: 'images/'
    - filename (string): name of the figure image file to be saved.

    NaN codes are left out; the remaining codes are cast to int before being
    used as x tick labels. The figure is shown without blocking.
    """
    frame = pd.DataFrame()
    frame['labels'] = labels
    frame['codes'] = codes
    frame['prediction'] = prediction

    # Sorted, NaN-free, integer list of the codes present in the data
    uniqueCodes = np.sort(frame['codes'].unique())
    uniqueCodes = uniqueCodes[~np.isnan(uniqueCodes)].astype(int)

    # Accuracy per code: correct predictions among the samples of that code
    isCorrect = frame['prediction'] == frame['labels']
    accuracy = []
    for code in uniqueCodes:
        hasCode = frame['codes'] == code
        hits = (isCorrect & hasCode).sum()
        accuracy.append((hits * 100.0) / hasCode.sum())

    # Draw the chart
    fig, ax = plt.subplots(figsize=(11, 5))
    positions = list(range(len(uniqueCodes)))
    percentTicks = list(range(0, 101, 10))

    plt.bar(positions, accuracy, figure=fig, align='center', width=0.3)
    ax.tick_params(axis='y', gridOn=True)
    ax.set_xticks(positions)
    ax.set_xticklabels(uniqueCodes)
    ax.set_yticks(percentTicks)
    ax.set_ylim([0, 100])
    ax.set_ylabel('Acurácia (%)')
    ax.set_xlabel('Codigo')
    ax.set_title('Acurácia do modelo para cada código')

    # show the figure, but do not block
    plt.show(block=False)

    if save:
        util.CheckAndCreatePath(path)
        util.SaveFigure(fig, path, filename)
def OneHotEncode(df, columns=None, save=False, load=False, path=''):
    """
    Performs One-Hot encoding on specified columns or in all of them.

    Each column is first label encoded through LabelEncode and the resulting
    integer codes are then one-hot encoded. The new columns are named
    '<column>_<class>' and placed where the original column was.

    - df (pandas dataframe / pandas Series): data containing the columns to
      one-hot encode. It is not modified; the work is done on a copy.
    - columns (list of strings/numbers): list of columns names to one-hot
      encode. Defaults to every column of df.
    - save (bool): tells if the encoders must be saved.
    - load (bool): tells if previously saved encoders must be loaded. Loaded
      encoders are applied as they are (transform only, no refit), so the
      output columns match the ones produced when the encoders were fitted.
    - path (string): path where the encoders must be saved at or loaded from.
      It is used as a plain string prefix of the file names.

    Return (pandas dataframe): dataframe with specified columns one-hot
    encoded.
    """
    if isinstance(df, pd.Series):
        df = df.to_frame()

    if columns is None:
        columns = list(df.columns)

    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    # Perform Label encode on the columns. LabelEncode writes the codes into
    # the frame it receives, so give it a copy to keep the caller's data intact.
    df_enc, labelEncoders = LabelEncode(df.copy(),
                                        columns=columns,
                                        save=save,
                                        load=load,
                                        path=path,
                                        returnEncoders=True)

    # Perform One-hot encode on the columns
    for column, labelEncoder in zip(columns, labelEncoders):
        encoderFile = path + "one_hot_encoder_" + str(column) + ".pickle.dat"

        if load:
            # NOTE: unpickling runs code stored in the file; only load
            # encoders from a trusted location.
            with open(encoderFile, "rb") as handle:
                encoder = pickle.load(handle)
            # Reuse the fitted encoder; refitting would throw away what was loaded.
            oneHotEncoded = encoder.transform(df_enc[column].to_frame())
        else:
            encoder = preprocessing.OneHotEncoder()
            oneHotEncoded = encoder.fit_transform(df_enc[column].to_frame())

        # Create dataframe with the one-hot encoded features
        columns_transformed = [
            str(column) + '_' + str(i) for i in labelEncoder.classes_
        ]
        oneHotEncoded = pd.DataFrame(oneHotEncoded.toarray(),
                                     columns=columns_transformed)

        # Put one-hot columns on the right position of the dataset and
        # delete the old feature
        df_enc = util.InsertColumnsOnADataframePosition(
            df_enc, oneHotEncoded, list(df_enc.columns).index(column))
        df_enc = df_enc.drop(columns=[column])

        # Save encoder
        if save:
            with open(encoderFile, "wb") as handle:
                pickle.dump(encoder, handle)

    return df_enc


def PlotFeatureImportanceXGBoost(model, save=False, path='', filename='img'):
    """
    Plots the importance of each feature from a XGBoost model, using
    xgb.plot_importance on a 6x7 figure.

    - model (XGBoost model): model.
    - save (bool): tells if the plot should be saved.
    - path (string): path where to save the figure.
    - filename (string): name of the figure image file to be saved.
    """
    fig, ax = plt.subplots(figsize=(6, 7))
    xgb.plot_importance(model, ax=ax)
    plt.show()

    if (save):
        util.CheckAndCreatePath(path)
        util.SaveFigure(fig, path, filename)


def LabelEncode(df, columns=None, save=False, load=False, path='', returnEncoders=False):
    """
    Performs label encoding on specified columns or in all of them.

    - df (pandas dataframe / pandas Series): dataframe containing the columns
      to label encode, or series with only one column. When a dataframe is
      given, the encoded values are written into it (in place).
    - columns (list of strings/numbers): list of columns names to label
      encode. Defaults to every column of df.
    - save (bool): tells if the encoder must be saved.
    - load (bool): tells if the encoder must be loaded. A loaded encoder is
      applied as it is (transform only, no refit); scikit-learn's
      LabelEncoder raises ValueError for values it has not seen when fitted.
    - path (string): path where the encoder must be saved at or loaded from.
      It is used as a plain string prefix of the file names.
    - returnEncoders (bool): tells if the encoders should be returned too.

    Return (pandas dataframe, list of LabelEncoders): dataset with specified
    columns label encoded and, in case returnEncoders is True, the list of
    Label Encoders in the same order as `columns`.
    """
    if isinstance(df, pd.Series):
        df = df.to_frame()

    if columns is None:
        columns = list(df.columns)

    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    encoders = []
    for column in columns:
        encoderFile = path + "label_encoder_" + str(column) + ".pickle.dat"

        # Load or create Label Encoder
        if load:
            # NOTE: unpickling runs code stored in the file; only load
            # encoders from a trusted location.
            with open(encoderFile, "rb") as handle:
                encoder = pickle.load(handle)
            # Keep the loaded class-to-code mapping instead of refitting it.
            df[column] = encoder.transform(df[column])
        else:
            encoder = preprocessing.LabelEncoder()
            df[column] = encoder.fit_transform(df[column])

        encoders.append(encoder)

        # Save encoder
        if save:
            with open(encoderFile, "wb") as handle:
                pickle.dump(encoder, handle)

    if returnEncoders:
        return df, encoders
    return df
def ApplyPCA(X, columns, save=False, load=False, path='', n_components=9):
    '''
    Apply PCA on specific columns on the dataset.

    The selected columns are replaced by the PCA components, which are named
    0, 1, 2, ... and joined with the remaining columns through
    util.ConcatenateDataframes.

    - X (pd.DataFrame): dataset with the columns where PCA should be applied.
    - columns (list of numbers): positional ids of the columns to apply the
      PCA on (they index X.columns).
    - save (bool): tells if the PCA transformer should be saved.
    - load (bool): tells if the PCA transformer should be loaded. A loaded
      transformer is applied as it is (transform only, no refit), and
      `n_components` is then ignored.
    - path (string): path where the transformer must be saved at or loaded
      from. It is used as a plain string prefix of the file name.
    - n_components (int): number of PCA components when a new transformer is
      fitted.

    Return (pd.DataFrame): X with the selected columns replaced by the PCA
    features.
    '''
    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    transformerFile = path + "pca_transformer.pickle.dat"
    selectedColumns = X.columns[columns]

    # Apply PCA
    if load:
        # NOTE: unpickling runs code stored in the file; only load
        # transformers from a trusted location.
        with open(transformerFile, "rb") as handle:
            pca = pickle.load(handle)
        # Project with the stored components; refitting would discard them.
        pcaFeatures = pca.transform(X[selectedColumns])
    else:
        pca = PCA(n_components=n_components)
        pcaFeatures = pca.fit_transform(X[selectedColumns])

    columnsNames = list(range(pcaFeatures.shape[1]))
    pcaFeatures = pd.DataFrame(pcaFeatures, columns=columnsNames)

    X_transformed = util.ConcatenateDataframes(
        pcaFeatures, X.drop(selectedColumns, axis=1))

    # Save PCA transformer
    if save:
        with open(transformerFile, "wb") as handle:
            pickle.dump(pca, handle)

    return X_transformed
def PlotConfusionMatrix(cm,
                        cm_confidence_interval,
                        classes,
                        normalize=False,
                        title='Matriz de Confusão',
                        cmap=plt.cm.Blues,
                        save=False,
                        path='',
                        imgname='matriz_de_confusão'):
    """Build and show the confusion matrix figure.

    Parameters
    ----------
    cm : numpy.array
        Confusion matrix (rows: true class, columns: predicted class).
    cm_confidence_interval : numpy.array or None
        Per-cell confidence interval, indexed like `cm`. It is only printed
        (as "value ± interval") when `normalize` is true; pass None to omit it.
    classes : list
        Names of the label classes, used as tick labels.
        Example: ['DI', 'DP'].
    normalize : bool, default False
        If true, every row of the matrix is divided by its sum. Rows whose
        sum is zero are shown as 0.00 instead of NaN.
    title : string, default 'Matriz de Confusão'
        Title of the figure.
    cmap : matplotlib.pyplot.cm, default matplotlib.pyplot.cm.Blues
        Colormap used for the cells.
    save : bool, default False
        If true, the figure is saved through util.SaveFigure.
    path : str
        Directory where the figure is saved.
    imgname : str, default 'matriz_de_confusão'
        Name of the saved image.
    """
    if normalize:
        rowTotals = cm.sum(axis=1)[:, np.newaxis]
        # Skip the division for empty rows so they do not turn into NaN.
        cm = np.divide(cm.astype('float'),
                       rowTotals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=rowTotals != 0)
        print("Matriz de Confusão Normalizada")
    else:
        print('Matriz de Confusão, sem normalização')

    def cellText(i, j):
        # Text written inside cell (i, j)
        if not normalize:
            return f'{cm[i,j]:.0f}'
        if cm_confidence_interval is not None:
            return (f'{cm[i,j]:.2f} ' + u"\u00B1" +
                    f' {cm_confidence_interval[i,j]:.4f}')
        return f'{cm[i,j]:.2f}'

    fig, ax = plt.subplots(figsize=(5, 5))
    im = ax.imshow(cm, aspect='equal', interpolation='nearest', cmap=cmap)
    # The title size used to be "set" through `ax.titlesize = 13`, which only
    # created an unused attribute; apply it for real here.
    ax.set_title(title, fontsize=13)
    ax.figure.colorbar(im, ax=ax, shrink=0.7)

    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)

    # Minor ticks on the cell borders
    ax.set_xticks(np.arange(cm.shape[1] + 1) - .5, minor=True)
    ax.set_yticks(np.arange(cm.shape[0] + 1) - .5, minor=True)
    ax.tick_params(axis='x', rotation=45)
    ax.tick_params(axis='both', labelsize=12)

    ax.set_xlabel('Classe Prevista', size=12)
    ax.set_ylabel('Classe Verdadeira', size=12)

    # White text on dark cells, black text on light ones
    thresh = (cm.max() + cm.min()) / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j,
                    i,
                    cellText(i, j),
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black",
                    fontsize=12)

    fig.tight_layout()
    ax.grid(False)
    plt.show(block=False)

    if save:
        util.CheckAndCreatePath(path)
        util.SaveFigure(fig, path, imgname)
def _SetSupportedParams(estimator, params):
    """Set on `estimator` only the entries of `params` it actually accepts."""
    supported = estimator.get_params()
    estimator.set_params(
        **{name: value for name, value in params.items() if name in supported})


def ColumnAsBagOfWords(column,
                       regex=None,
                       save=False,
                       load=False,
                       applyTFIDF=True,
                       path='',
                       **kwargs):
    """
    Encode text column as bag of words.

    - column (pd.Series): vector containing the text data. Its name is used
      as prefix of the output columns and in the file names.
    - regex (raw string): string with the tokenizer pattern. If not passed,
      r"[^_^.^-]+" is used, i.e. every run of characters other than "_", "^",
      "." and "-" is a token.
    - save (bool): tells if the Vectorizer (and Tfidf Transformer) must be
      saved.
    - load (bool): tells if the Vectorizer (and Tfidf Transformer) must be
      loaded. Loaded objects are applied as they are (transform only, no
      refit); `regex` and `kwargs` are ignored in that case.
    - applyTFIDF (bool): tells if tf-idf (term frequency–inverse document
      frequency) should be applied.
    - path (string): path where the objects must be saved at or loaded from.
      It is used as a plain string prefix of the file names.
    - kwargs (dict): CountVectorizer / TfidfTransformer parameters to set.
      Each estimator receives only the parameters it supports; names
      supported by neither are ignored.

    Return (pd.DataFrame, CountVectorizer, TfidfTransformer):
    Resulting dataframe with one '<column name>-<term>' column per vocabulary
    term, Count Vectorizer (scikit-learn), Tfidf Transformer (scikit-learn,
    None when applyTFIDF is False).
    """
    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    if regex is None:
        regex = r"[^_^.^-]+"

    ColumnName = column.name
    vectorizerFile = path + "count_vectorizer_" + str(ColumnName) + ".pickle.dat"
    tfidfFile = path + "tf_idf_" + str(ColumnName) + ".pickle.dat"

    # Apply bag of words vectorization
    if load:
        # NOTE: unpickling runs code stored in the file; only load from a
        # trusted location.
        with open(vectorizerFile, "rb") as handle:
            count_vect = pickle.load(handle)
        # Keep the stored vocabulary instead of refitting it.
        columTransformed = count_vect.transform(column)
    else:
        count_vect = CountVectorizer(token_pattern=regex)
        _SetSupportedParams(count_vect, kwargs)
        columTransformed = count_vect.fit_transform(column)

    # Apply tf-idf (term frequency–inverse document frequency)
    if applyTFIDF:
        if load:
            with open(tfidfFile, "rb") as handle:
                tf_idf = pickle.load(handle)
            columTransformed = tf_idf.transform(columTransformed)
        else:
            tf_idf = TfidfTransformer(norm='l1', use_idf=True)
            _SetSupportedParams(tf_idf, kwargs)
            columTransformed = tf_idf.fit_transform(columTransformed)
    else:
        tf_idf = None

    # Create the column names. vocabulary_ maps term -> column index, but its
    # key order is not the column order, so order the terms by their index.
    vocabulary = count_vect.vocabulary_
    columnNames = [(str(ColumnName) + '-' + str(term))
                   for term in sorted(vocabulary, key=vocabulary.get)]

    # Save vectorizer and tf-idf transformer
    if save:
        with open(vectorizerFile, "wb") as handle:
            pickle.dump(count_vect, handle)
        if applyTFIDF:
            with open(tfidfFile, "wb") as handle:
                pickle.dump(tf_idf, handle)

    # Construct final dataframe of transformed column
    df = pd.DataFrame(columTransformed.toarray(), columns=columnNames)

    return df, count_vect, tf_idf
def Generate(model, modelName, codes, X_train, Y_train, X_test, Y_test, path='Reports/'):
    """
    Generate a four page PDF report about a trained model.

    Pages: (1) title, date, model name, number of attributes and feature
    importance; (2) unique values per column, class balance and code
    frequency; (3) test metrics; (4) train metrics.

    - model: fitted model exposing predict(); it is also passed to
      ModelEvaluation.PlotFeatureImportanceXGBoost.
    - modelName (string): name printed on the report.
    - codes: code of each sample. The first len(Y_train) entries are used
      for the train set and the rest for the test set
      (NOTE(review): this assumes codes is ordered train first, then test —
      confirm against the caller).
    - X_train, X_test (pd.DataFrame): features of the train and test sets.
    - Y_train, Y_test (pd.Series): labels of the train and test sets.
    - path (string): directory prefix of the PDF file, which is named
      'relatorio_<day>_<month>_<year>-<hour>_<min>_<sec>.pdf'.

    Side effects: the plots are saved with path 'images' and read back from
    'images/<name>.png'; the module globals WIDTH, HEIGHT, MARGIN_Y, MARGIN_X
    and LIN_HEIGHT are (re)assigned. Returns nothing.
    """
    global WIDTH
    global HEIGHT
    global MARGIN_Y
    global MARGIN_X
    global LIN_HEIGHT

    # Generate needed variables
    X = pd.concat([X_train, X_test])
    Y = pd.concat([Y_train, Y_test])
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)

    data = datetime.now()
    nomeArquivo = 'relatorio' + data.strftime('_%d_%m_%Y-%H_%M_%S') + '.pdf'
    titulo = 'Relatório do Modelo'
    dataString = data.strftime('%d/%m/%Y %H:%M:%S')
    colunas = list(X_train.columns)

    ModelEvaluation.PlotFeatureImportanceXGBoost(model,
                                                 save=True,
                                                 path='images',
                                                 filename='feature_importance')
    uniqueValues = EDA.UniqueValuesOnEachColumn(X)
    ClassBalance = EDA.CalculateClassBalance(Y.to_frame(), get=True)
    EDA.PlotCodeFrequency(codes,
                          Y,
                          save=True,
                          path='images',
                          filename='code_frequency')

    metrics_test = ModelEvaluation.EvaluateClassification(
        Y_test,
        predictions_test,
        'classifier_scores_v3',
        save=True,
        path='images',
        imgname='confusion_matrix_test')
    ModelEvaluation.PlotAccuracyOfEachEventCode(
        codes[len(Y_train):],
        Y_test,
        predictions_test,
        save=True,
        path='images',
        filename='accuracy_by_code_test')

    metrics_train = ModelEvaluation.EvaluateClassification(
        Y_train,
        predictions_train,
        'classifier_scores_v3',
        save=True,
        path='images',
        imgname='confusion_matrix_train')
    ModelEvaluation.PlotAccuracyOfEachEventCode(
        codes[:len(Y_train)],
        Y_train,
        predictions_train,
        save=True,
        path='images',
        filename='accuracy_by_code_train')

    # Create document
    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)
    c = canvas.Canvas(path + nomeArquivo, pagesize=A4)

    WIDTH, HEIGHT = A4
    MARGIN_Y = 2.54 * cm
    MARGIN_X = 1.5 * cm
    LIN_HEIGHT = 0.8 * cm

    c.translate(MARGIN_X, 0)
    cursor = HEIGHT - MARGIN_Y

    ##########
    # Page 1 #
    ##########

    # Title
    c.saveState()
    c.translate(0, cursor)
    c.setFont("Helvetica", 20)
    c.setFillColorRGB(0, 0, 0)
    x = (WIDTH - 2 * MARGIN_X) / 2  # horizontal centre of the printable area
    c.drawCentredString(x, 0, titulo)
    cursor -= 1.3 * cm
    c.restoreState()

    # Date
    c.saveState()
    c.translate(0, cursor)
    c.setFont("Helvetica", 20)
    c.drawCentredString(x, 0, dataString)
    cursor -= 1.3 * cm
    c.restoreState()

    cursor = SkipLine(cursor, 1)

    # About the model
    cursor = InsertSection(c, 0, 'Modelo', cursor)
    table = [['Modelo:', modelName], ['Número de Atributos:', len(colunas)]]
    cursor = InsertField(table, c, cursor)

    c.saveState()
    c.translate(7 * cm, 12.6 * cm)
    _ = InsertImage('images/feature_importance.png', c, 11 * cm, 0)
    c.restoreState()

    cursor = SkipLine(cursor, n=3)
    c.showPage()

    ##########
    # Page 2 #
    ##########

    # About the Dataset
    c.translate(MARGIN_X, 0)
    cursor = HEIGHT - MARGIN_Y
    cursor = InsertSection(c, 0, 'Dataset', cursor)

    # Unique Values on each column feature
    table2 = [[col, uniqueValues[col]] for col in uniqueValues.keys()]
    cursor = InsertField(table2, c, cursor)

    # Class Balance Plot
    NumToClass = lambda n: 'DP' if (n == 1) else 'DI'
    percentage = lambda n, total: (n * 100.0) / total

    total = sum(ClassBalance.values())
    valueInPercentage = [
        percentage(count, total) for count in ClassBalance.values()
    ]
    labels = list(map(NumToClass, ClassBalance.keys()))
    colors = ['#5B9BD5', '#BDD7EE']

    # Draw the pie on its own figure. The plots created above are shown
    # without blocking and are not closed, so drawing on "the current figure"
    # could put the pie on top of the last bar chart.
    fig, pieAx = plt.subplots()
    pieAx.pie(valueInPercentage,
              labels=labels,
              colors=colors,
              startangle=120,
              frame=False,
              autopct='%.1f %%')
    # White disc in the middle turns the pie into a donut
    centre_circle = plt.Circle((0, 0),
                               0.5,
                               color='black',
                               fc='white',
                               linewidth=0)
    pieAx.add_artist(centre_circle)
    pieAx.set_title('Balanceamento', fontsize=14)
    pieAx.axis('equal')
    fig.tight_layout()
    # Save before showing so the file does not depend on the window state.
    fig.savefig('images/class_balance_plot.png',
                bbox_inches='tight',
                transparent=True)
    plt.show()
    plt.close(fig)

    # Show Class Balance Plot
    c.saveState()
    c.translate(8.5 * cm, 18.3 * cm)
    _ = InsertImage('images/class_balance_plot.png', c, 12 * cm, 0)
    c.restoreState()

    # Codes distribution on the dataset
    c.saveState()
    c.translate(0 * cm, MARGIN_Y)
    _ = InsertImage('images/code_frequency.png', c, 17.5 * cm, 0)
    c.restoreState()

    c.showPage()

    ##########
    # Page 3 #
    ##########

    # Metrics - Test
    c.translate(MARGIN_X, 0)
    cursor = HEIGHT - MARGIN_Y
    sectionTitle = 'Métricas - Teste (%.2f %% - %d)' % (percentage(
        len(X_test), len(X)), len(X_test))
    cursor = InsertSection(c, 0, sectionTitle, cursor)

    # Table with the metrics
    table3 = [['Acurácia:',
               '%.2f %%' % (metrics_test['accuracy'] * 100)],
              ['MCC:', '%.5f' % (metrics_test['mcc'])],
              ['Macro-F1:', '%.5f' % (metrics_test['macrof1'])],
              ['Micro-F1:', '%.5f' % (metrics_test['microf1'])],
              ['AUC ROC:', '%.5f' % (metrics_test['rocauc'])]]
    cursor = InsertField(table3, c, cursor)

    c.saveState()
    c.translate(6 * cm, 16 * cm)
    _ = InsertImage('images/confusion_matrix_test.png', c, 12 * cm, 0)
    c.restoreState()

    c.saveState()
    c.translate(0 * cm, MARGIN_Y)
    _ = InsertImage('images/accuracy_by_code_test.png', c, 17.5 * cm, 0)
    c.restoreState()

    c.showPage()

    ##########
    # Page 4 #
    ##########

    # Metrics - Train
    c.translate(MARGIN_X, 0)
    cursor = HEIGHT - MARGIN_Y
    sectionTitle = 'Métricas - Treino (%.2f %% - %d)' % (percentage(
        len(X_train), len(X)), len(X_train))
    cursor = InsertSection(c, 0, sectionTitle, cursor)

    table4 = [['Acurácia:',
               '%.2f %%' % (metrics_train['accuracy'] * 100)],
              ['MCC:', '%.5f' % (metrics_train['mcc'])],
              ['Macro-F1:', '%.5f' % (metrics_train['macrof1'])],
              ['Micro-F1:', '%.5f' % (metrics_train['microf1'])],
              ['AUC ROC:', '%.5f' % (metrics_train['rocauc'])]]
    cursor = InsertField(table4, c, cursor)

    c.saveState()
    c.translate(6 * cm, 16 * cm)
    _ = InsertImage('images/confusion_matrix_train.png', c, 12 * cm, 0)
    c.restoreState()

    c.saveState()
    c.translate(0 * cm, MARGIN_Y)
    _ = InsertImage('images/accuracy_by_code_train.png', c, 17.5 * cm, 0)
    c.restoreState()

    c.showPage()
    c.save()