def PlotCodeFrequency(codes, labels, save=False, path='', filename='img'):
    """
    Plot the frequency of each code on the dataset and also which ones are DI or DP.

    One stacked bar is drawn per distinct code (codes sorted ascending). The
    bar height is the percentage of ALL samples that carry that code; the
    lower segment counts the samples labelled DP (1) and the upper segment
    the samples labelled DI (0).

    - codes (pandas Series, list, numpy array): codes of each sample.
    - labels (pandas Series, list, numpy array): labels of each sample
      (0 means DI, 1 means DP).
    - save (bool): tells if the plot should be saved.
    - path (string): path where to save the figure.
    - filename (string): name of the figure image file to be saved.
    """
    DI = 0
    DP = 1

    df = pd.DataFrame()
    df['labels'] = labels
    df['codes'] = codes

    total = len(df)
    existingCodes = sorted(df['codes'].unique())

    # The label masks do not depend on the code, so they are built only once.
    isDI = df['labels'] == DI
    isDP = df['labels'] == DP

    freq_di = []
    freq_dp = []
    for code in existingCodes:
        hasCode = df['codes'] == code
        freq_di.append((hasCode & isDI).sum() * 100.0 / total)
        freq_dp.append((hasCode & isDP).sum() * 100.0 / total)

    ind = np.arange(len(existingCodes))
    width = 0.5

    # Work on an explicit Axes. The previous plt.axes() call adds a NEW axes
    # on current Matplotlib releases, so the minor locator (and everything
    # drawn afterwards) would not end up on the axes that holds the bars.
    fig, ax = plt.subplots(figsize=(11, 5))
    dp_bar = ax.bar(ind, freq_dp, width)
    di_bar = ax.bar(ind, freq_di, width, bottom=freq_dp)

    ax.set_ylabel('Porcentagem (%)')
    ax.set_xlabel('Códigos')
    ax.set_title('Frequência de cada código no dataset')
    ax.set_xticks(ind)
    ax.set_xticklabels(existingCodes)
    ax.set_yticks(np.arange(0, 25, 5))
    ax.yaxis.set_minor_locator(MultipleLocator(1))
    ax.legend((di_bar[0], dp_bar[0]), ('DI', 'DP'))
    ax.grid(True, which='both', axis='y')

    plt.show()

    if save:
        util.CheckAndCreatePath(path)
        util.SaveFigure(fig, path, filename)
예제 #2
0
def PlotAccuracyOfEachEventCode(codes,
                                labels,
                                prediction,
                                save=False,
                                path='',
                                filename='img'):
    """
    Draw a bar chart with the model accuracy (in %) for each event code.

    For every distinct, non-NaN code the accuracy is the share of samples
    with that code whose prediction equals the label.

    - codes (pandas Series, list, numpy array): codes of each sample.
    - labels (pandas Series, list, numpy array): labels of each sample.
    - prediction (pandas Series, list, numpy array): predictions of each sample.
    - save (bool): tells if the plot should be saved.
    - path (string): path where to save the figure. e.g.: 'images/'
    - filename (string): name of the figure image file to be saved.
    """
    frame = pd.DataFrame()
    frame['labels'] = labels
    frame['codes'] = codes
    frame['prediction'] = prediction

    # Sorted, NaN-free, integer version of the codes present in the data.
    uniqueCodes = np.sort(frame['codes'].unique())
    uniqueCodes = uniqueCodes[~np.isnan(uniqueCodes)].astype(int)

    isCorrect = frame['prediction'] == frame['labels']

    def accuracyFor(code):
        # Percentage of correctly predicted samples among those with `code`.
        ofThisCode = frame['codes'] == code
        return ((isCorrect & ofThisCode).sum() * 100.0) / ofThisCode.sum()

    accuracy = [accuracyFor(code) for code in uniqueCodes]

    # Draw the chart
    fig, ax = plt.subplots(figsize=(11, 5))
    positions = list(range(len(uniqueCodes)))
    percentTicks = list(range(0, 101, 10))

    plt.bar(positions, accuracy, width=0.3, align='center', figure=fig)
    ax.tick_params(axis='y', gridOn=True)
    ax.set_xticks(positions)
    ax.set_xticklabels(uniqueCodes)
    ax.set_yticks(percentTicks)
    ax.set_ylim([0, 100])
    ax.set_ylabel('Acurácia (%)')
    ax.set_xlabel('Codigo')
    ax.set_title('Acurácia do modelo para cada código')

    # Show the figure, but do not block
    plt.show(block=False)

    if not save:
        return

    util.CheckAndCreatePath(path)
    util.SaveFigure(fig, path, filename)
예제 #3
0
def OneHotEncode(df, columns = None, save = False, load = False, path = ''):
    """
    Performs One-Hot encoding on specified columns or in all of them.

    Each selected column is first label encoded through LabelEncode and then
    replaced, at the same position, by one indicator column per class. The
    new columns are named "<column>_<class>".

    - df (pandas dataframe / pandas Series): data containing the columns to
    one-hot encode.
    - columns (list of strings/numbers) : list of columns names to one-hot
    encode. If None, every column is encoded.
    - save (bool): bool that tells if the encoder must be saved.
    - load (bool): bool that tells if the encoder must be loaded. A loaded
    encoder is only applied (transform); it is not fitted again.
    - path (string): path where the encoder must be saved at or loaded from.
    It is used as a plain prefix of the file name, so it should end with a
    path separator.

    Return (pandas dataframe): dataframe with specified columns one-hot encoded.
    """
    if isinstance(df, pd.Series):
        df = df.to_frame()

    if columns is None:
        columns = list(df.columns)

    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    # Perform Label encode on the columns
    df_enc, labelEncoders = LabelEncode(df, columns = columns, save = save, load = load, path = path, returnEncoders=True)

    # Perform One-hot encode on the columns; LabelEncode returns one label
    # encoder per column, in the same order as `columns`.
    for column, labelEncoder in zip(columns, labelEncoders):
        encoderFile = path + "one_hot_encoder_" + str(column) + ".pickle.dat"
        columnData = df_enc[column].to_frame()

        if load:
            # NOTE: pickle can execute arbitrary code while loading; only
            # load encoder files that come from a trusted source.
            with open(encoderFile, "rb") as encoderStream:
                encoder = pickle.load(encoderStream)
            # Re-fitting here would throw away the loaded categories, so the
            # loaded encoder is only applied.
            oneHotEncoded = encoder.transform(columnData)
        else:
            encoder = preprocessing.OneHotEncoder()
            oneHotEncoded = encoder.fit_transform(columnData)

        # Create dataframe with the one-hot encoded features
        # NOTE(review): this frame gets a default RangeIndex; confirm that
        # util.InsertColumnsOnADataframePosition does not align on the index.
        columns_transformed = [str(column) + '_' + str(i) for i in labelEncoder.classes_]
        oneHotEncoded = pd.DataFrame(oneHotEncoded.toarray(), columns=columns_transformed)

        # Put one-hot columns on the right position of the dataset and delete old feature
        df_enc = util.InsertColumnsOnADataframePosition(df_enc, oneHotEncoded, list(df_enc.columns).index(column))
        df_enc = df_enc.drop(columns=[column])

        # Save encoder
        if save:
            with open(encoderFile, "wb") as encoderStream:
                pickle.dump(encoder, encoderStream)

    return df_enc
예제 #4
0
def PlotFeatureImportanceXGBoost(model, save=False, path='', filename='img'):
    """
    Show the feature-importance chart of a XGBoost model and optionally save it.

    - model (XGBoost model): model whose feature importances are plotted.
    - save (bool): tells if the plot should be saved.
    - path (string): path where to save the figure.
    - filename (string): name of the figure image file to be saved.
    """
    figure, axes = plt.subplots(figsize=(6, 7))
    xgb.plot_importance(model, ax=axes)
    plt.show()

    if not save:
        return

    util.CheckAndCreatePath(path)
    util.SaveFigure(figure, path, filename)
예제 #5
0
def LabelEncode(df, columns = None, save = False, load = False, path = '', returnEncoders = False):
    """
    Performs label encoding on specified columns or in all of them.

    The encoded values are written back into `df` itself, so a dataframe
    passed by the caller is modified in place (and also returned).

    - df (pandas dataframe/ pandas Series): dataframe containing the columns to
    label encode, or series with only one column.
    - columns (list of strings/numbers) : list of columns names to label encode.
    If None, every column is encoded.
    - save (bool): tells if the encoder must be saved.
    - load (bool): tells if the encoder must be loaded. A loaded encoder is
    only applied (transform); it is not fitted again.
    - path (string): path where the encoder must be saved at or loaded from.
    It is used as a plain prefix of the file name, so it should end with a
    path separator.
    - returnEncoders (bool): tells if the encoder should be returned too.

    Return (pandas dataframe, list of LabelEncoders): dataset with specified
    columns label encoded and, in case returnEncoders is True, the list of
    Label Encoders (one per column, in the order of `columns`).
    """
    if isinstance(df, pd.Series):
        df = df.to_frame()

    if columns is None:
        columns = list(df.columns)

    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    encoders = list()
    for column in columns:
        encoderFile = path + "label_encoder_" + str(column) + ".pickle.dat"

        if load:
            # NOTE: pickle can execute arbitrary code while loading; only
            # load encoder files that come from a trusted source.
            with open(encoderFile, "rb") as encoderStream:
                encoder = pickle.load(encoderStream)
            # Re-fitting here would throw away the loaded classes, so the
            # loaded encoder is only applied.
            df[column] = encoder.transform(df[column])
        else:
            encoder = preprocessing.LabelEncoder()
            df[column] = encoder.fit_transform(df[column])

        encoders.append(encoder)

        # Save encoder
        if save:
            with open(encoderFile, "wb") as encoderStream:
                pickle.dump(encoder, encoderStream)

    if returnEncoders:
        return df, encoders

    return df
예제 #6
0
def ApplyPCA(X, columns, save=False, load=False, path = '', n_components = 9):
    '''
    Apply PCA on specific columns on the dataset

    The selected columns are replaced by the PCA components, which are named
    0, 1, 2, ... and passed to util.ConcatenateDataframes ahead of the
    remaining (untouched) columns of X.

    - X (pd.DataFrame): dataset with the columns where PCA should be applied.
    - columns (list of numbers): List of number Ids (positions) of columns to
    apply the PCA.
    - save (bool): bool that tells if the PCA transformer should be saved.
    - load (bool): bool that tells if the PCA transformer should be loaded.
    A loaded transformer is only applied (transform); it is not fitted again
    and n_components is ignored.
    - path (string): path where the PCA transformer must be saved at or loaded
    from. It is used as a plain prefix of the file name, so it should end
    with a path separator.
    - n_components (int): number of PCA components (used only when a new
    transformer is created).

    Return (pd.DataFrame): dataset with the selected columns replaced by
    the PCA features.
    '''

    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    transformerFile = path + "pca_transformer.pickle.dat"
    selectedColumns = X.columns[columns]

    # Apply PCA
    if load:
        # NOTE: pickle can execute arbitrary code while loading; only load
        # transformer files that come from a trusted source.
        with open(transformerFile, "rb") as transformerStream:
            pca = pickle.load(transformerStream)
        # Re-fitting here would throw away the loaded projection, so the
        # loaded transformer is only applied.
        pcaFeatures = pca.transform(X[selectedColumns])
    else:
        pca = PCA(n_components=n_components)
        pcaFeatures = pca.fit_transform(X[selectedColumns])

    columnsNames = list(range(pcaFeatures.shape[1]))
    pcaFeatures = pd.DataFrame(pcaFeatures, columns=columnsNames)

    X_transformed = util.ConcatenateDataframes(pcaFeatures, X.drop(selectedColumns, axis=1))

    # Save PCA transformer
    if save:
        with open(transformerFile, "wb") as transformerStream:
            pickle.dump(pca, transformerStream)

    return X_transformed
예제 #7
0
def PlotConfusionMatrix(cm,
                        cm_confidence_interval,
                        classes,
                        normalize=False,
                        title='Matriz de Confusão',
                        cmap=plt.cm.Blues,
                        save=False,
                        path='',
                        imgname='matriz_de_confusão'):
    """Build and show the confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : numpy.array
        Confusion matrix (rows: true class, columns: predicted class).
    cm_confidence_interval : numpy.array or None
        Per-cell half-width written after a plus-minus sign next to each
        value. Only used when `normalize` is True; pass None to omit it.
    classes : list
        Name of each label class. Example: ['DI', 'DP'].
    normalize : bool, default False
        If True, every row of the matrix is divided by its sum.
    title : string, default 'Matriz de Confusão'
        Title of the confusion matrix image.
    cmap : matplotlib.pyplot.cm, default matplotlib.pyplot.cm.Blues
        Colormap used for the matrix.
    save : bool, default False
        If True, the confusion matrix image is saved.
    path : str
        Directory where the confusion matrix image is saved.
    imgname : str, default 'matriz_de_confusão'
        Name of the confusion matrix image file to be saved.
    """
    cm = np.asarray(cm)
    if cm_confidence_interval is not None:
        cm_confidence_interval = np.asarray(cm_confidence_interval)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Matriz de Confusão Normalizada")
    else:
        print('Matriz de Confusão, sem normalização')

    def formatCell(i, j):
        # Text written inside the cell (i, j) of the heat map.
        if not normalize:
            return f'{cm[i,j]:.0f}'
        if cm_confidence_interval is None:
            return f'{cm[i,j]:.2f}'
        return f'{cm[i,j]:.2f} ' + u"\u00B1" + f' {cm_confidence_interval[i,j]:.4f}'

    fig, ax = plt.subplots(figsize=(5, 5))
    im = ax.imshow(cm, aspect='equal', interpolation='nearest', cmap=cmap)
    # The former `ax.titlesize = 13` only created an unused attribute on the
    # Axes; the size has to be passed to the title itself to take effect.
    ax.set_title(title, fontsize=13)
    fig.colorbar(im, ax=ax, shrink=0.7)

    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)

    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)

    # Minor ticks sit on the borders between cells.
    ax.set_xticks(np.arange(cm.shape[1] + 1) - .5, minor=True)
    ax.set_yticks(np.arange(cm.shape[0] + 1) - .5, minor=True)
    ax.tick_params(axis='x', rotation=45)
    ax.tick_params(axis='both', labelsize=12)
    ax.set_xlabel('Classe Prevista', size=12)
    ax.set_ylabel('Classe Verdadeira', size=12)

    # Cells darker than the middle of the value range get white text.
    thresh = (cm.max() + cm.min()) / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # Draw on `ax` explicitly so the text cannot end up on the
            # colorbar axes.
            ax.text(j,
                    i,
                    formatCell(i, j),
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black",
                    fontsize=12)

    fig.tight_layout()
    ax.grid(False)
    plt.show(block=False)

    if save:
        util.CheckAndCreatePath(path)
        util.SaveFigure(fig, path, imgname)
예제 #8
0
def ColumnAsBagOfWords(column, regex = None, save = False, load = False,
                       applyTFIDF = True, path = '', **kwargs):
    """
    Encode text column as bag of words.

    - column (pd.Series): vector containing the text data.
    - regex (raw string): string with the tokenizer pattern. If not passed, the
    pattern r"[^_^.^-]+" is used, i.e. every run of characters other than
    "_", "^", "." and "-" is taken as one word.
    NOTE(review): an earlier description listed " " instead of "-" as a
    separator; confirm which one is intended.
    - save (bool): bool that tells if the Vectorizer must be saved.
    - load (bool): bool that tells if the Vectorizer must be loaded. Loaded
    objects are only applied (transform); they are not fitted again.
    - applyTFIDF (bool): bool that tells if tf-idf (term frequency–inverse
    document frequency) should be applied.
    - path (string): path where the Vectorizer must be saved at or loaded from.
    It is used as a plain prefix of the file name, so it should end with a
    path separator.
    - kwargs (dict): CountVectorizer/TfidfTransformer parameters to set. Each
    object receives only the parameters it knows; other names are ignored.

    Return (pd.Dataframe, CountVectorizer, TfidfTransformer): Resulting dataframe
    (one column per vocabulary term, named "<column name>-<term>"),
    Count Vectorizer (scikit-learn), Tfidf Transformer (scikit-learn) or None
    when applyTFIDF is False.
    """
    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    if regex is None:
        regex = r"[^_^.^-]+"

    ColumnName = column.name
    vectorizerFile = path + "count_vectorizer_" + str(ColumnName) + ".pickle.dat"
    tfidfFile = path + "tf_idf_" + str(ColumnName) + ".pickle.dat"

    def setKnownParams(estimator):
        # kwargs mixes parameters of both objects; passing all of them to
        # set_params fails on the first unknown name and skips the rest.
        known = estimator.get_params()
        estimator.set_params(**{name: value for name, value in kwargs.items()
                                if name in known})

    # Apply bag of words vectorization
    # NOTE: pickle can execute arbitrary code while loading; only load files
    # that come from a trusted source.
    if load:
        with open(vectorizerFile, "rb") as vectorizerStream:
            count_vect = pickle.load(vectorizerStream)
    else:
        count_vect = CountVectorizer(token_pattern = regex)

    setKnownParams(count_vect)

    # A loaded vectorizer keeps its vocabulary; re-fitting would replace it.
    if load:
        columTransformed = count_vect.transform(column)
    else:
        columTransformed = count_vect.fit_transform(column)

    # Apply tf-idf(term frequency–inverse document frequency)
    if applyTFIDF:
        if load:
            with open(tfidfFile, "rb") as tfidfStream:
                tf_idf = pickle.load(tfidfStream)
        else:
            tf_idf = TfidfTransformer(norm='l1', use_idf=True)

        setKnownParams(tf_idf)

        if load:
            columTransformed = tf_idf.transform(columTransformed)
        else:
            columTransformed = tf_idf.fit_transform(columTransformed)
    else:
        tf_idf = None

    # create vector of column names
    # vocabulary_ maps each term to its column index; the dict itself is not
    # ordered by that index, so sort the terms by it to label columns right.
    vocabulary = count_vect.vocabulary_
    orderedTerms = sorted(vocabulary, key=vocabulary.get)
    columnNames = [(str(ColumnName) + '-' + str(term)) for term in orderedTerms]

    # Save vectorizer and tf-idf transformer
    if save:
        with open(vectorizerFile, "wb") as vectorizerStream:
            pickle.dump(count_vect, vectorizerStream)
        if applyTFIDF:
            with open(tfidfFile, "wb") as tfidfStream:
                pickle.dump(tf_idf, tfidfStream)

    # Construct final dataframe of transformed column
    df = pd.DataFrame(columTransformed.toarray(), columns=columnNames)

    return df, count_vect, tf_idf
예제 #9
0
def _DrawImageAt(c, imagePath, x, y, size):
    """Draw an image through InsertImage with the canvas origin moved to (x, y).

    The canvas state is saved before the move and restored afterwards, so
    the translation does not affect anything drawn later. `size` is forwarded
    as the third argument of InsertImage (presumably the drawn width — confirm).
    """
    c.saveState()
    c.translate(x, y)
    InsertImage(imagePath, c, size, 0)
    c.restoreState()


def _DrawMetricsPage(c, sectionTitle, metrics, confusionMatrixImage,
                     accuracyImage):
    """Fill one page with the metrics table, confusion matrix and per-code accuracy images, then close the page.

    Relies on the module globals MARGIN_X, MARGIN_Y and HEIGHT that Generate
    sets before calling this helper.
    """
    c.translate(MARGIN_X, 0)
    cursor = HEIGHT - MARGIN_Y
    cursor = InsertSection(c, 0, sectionTitle, cursor)

    # Table with the metrics
    table = [['Acurácia:',
              '%.2f %%' % (metrics['accuracy'] * 100)],
             ['MCC:', '%.5f' % (metrics['mcc'])],
             ['Macro-F1:', '%.5f' % (metrics['macrof1'])],
             ['Micro-F1:', '%.5f' % (metrics['microf1'])],
             ['AUC ROC:', '%.5f' % (metrics['rocauc'])]]
    cursor = InsertField(table, c, cursor)

    _DrawImageAt(c, confusionMatrixImage, 6 * cm, 16 * cm, 12 * cm)
    _DrawImageAt(c, accuracyImage, 0 * cm, MARGIN_Y, 17.5 * cm)

    c.showPage()


def Generate(model,
             modelName,
             codes,
             X_train,
             Y_train,
             X_test,
             Y_test,
             path='Reports/'):
    """
    Generate a four-page PDF report about a trained model.

    The plots are first produced and saved as images ('images/...png') and
    then placed on the pages:
    page 1 - model name, number of attributes and feature importance;
    page 2 - unique values per column, class balance and code frequency;
    page 3 - metrics, confusion matrix and accuracy per code on the test set;
    page 4 - the same for the training set.

    The file is written to path + 'relatorio_<day>_<month>_<year>-<hour>_<min>_<sec>.pdf'.
    The module globals WIDTH, HEIGHT, MARGIN_Y, MARGIN_X and LIN_HEIGHT are
    (re)assigned by this function.

    - model: trained model; must provide predict() and is handed to
    ModelEvaluation.PlotFeatureImportanceXGBoost.
    - modelName (string): model name printed on the report.
    - codes: code of each sample, ordered with the training samples first and
    the test samples after them (it is split at len(Y_train)).
    - X_train, X_test (pd.DataFrame): features of the train and test sets.
    - Y_train, Y_test (pd.Series): labels of the train and test sets.
    - path (string): where the PDF is written. It is used as a plain prefix of
    the file name, so it should end with a path separator.
    """
    global WIDTH
    global HEIGHT
    global MARGIN_Y
    global MARGIN_X
    global LIN_HEIGHT

    def percentage(n, total):
        return (n * 100.0) / total

    # Generate needed variables
    X = pd.concat([X_train, X_test])
    Y = pd.concat([Y_train, Y_test])
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)

    data = datetime.now()
    nomeArquivo = 'relatorio' + data.strftime('_%d_%m_%Y-%H_%M_%S') + '.pdf'
    titulo = 'Relatório do Modelo'
    dataString = data.strftime('%d/%m/%Y %H:%M:%S')

    colunas = list(X_train.columns)
    ModelEvaluation.PlotFeatureImportanceXGBoost(model,
                                                 save=True,
                                                 path='images',
                                                 filename='feature_importance')

    uniqueValues = EDA.UniqueValuesOnEachColumn(X)
    ClassBalance = EDA.CalculateClassBalance(Y.to_frame(), get=True)
    EDA.PlotCodeFrequency(codes,
                          Y,
                          save=True,
                          path='images',
                          filename='code_frequency')

    metrics_test = ModelEvaluation.EvaluateClassification(
        Y_test,
        predictions_test,
        'classifier_scores_v3',
        save=True,
        path='images',
        imgname='confusion_matrix_test')
    ModelEvaluation.PlotAccuracyOfEachEventCode(
        codes[len(Y_train):],
        Y_test,
        predictions_test,
        save=True,
        path='images',
        filename='accuracy_by_code_test')

    metrics_train = ModelEvaluation.EvaluateClassification(
        Y_train,
        predictions_train,
        'classifier_scores_v3',
        save=True,
        path='images',
        imgname='confusion_matrix_train')
    ModelEvaluation.PlotAccuracyOfEachEventCode(
        codes[:len(Y_train)],
        Y_train,
        predictions_train,
        save=True,
        path='images',
        filename='accuracy_by_code_train')

    # Create document

    # if directory doesnt exist, create it
    util.CheckAndCreatePath(path)

    c = canvas.Canvas(path + nomeArquivo, pagesize=A4)

    WIDTH, HEIGHT = A4
    MARGIN_Y = 2.54 * cm
    MARGIN_X = 1.5 * cm
    LIN_HEIGHT = 0.8 * cm

    c.translate(MARGIN_X, 0)
    cursor = HEIGHT - MARGIN_Y

    ##########
    # Page 1 #
    ##########

    # Title
    c.saveState()
    c.translate(0, cursor)
    c.setFont("Helvetica", 20)
    c.setFillColorRGB(0, 0, 0)

    x = (WIDTH - 2 * MARGIN_X) / 2
    c.drawCentredString(x, 0, titulo)
    cursor -= 1.3 * cm
    c.restoreState()

    # Date and time of the report
    c.saveState()
    c.translate(0, cursor)
    c.setFont("Helvetica", 20)
    c.drawCentredString(x, 0, dataString)
    cursor -= 1.3 * cm
    c.restoreState()

    cursor = SkipLine(cursor, 1)

    # About the model
    cursor = InsertSection(c, 0, 'Modelo', cursor)

    table = [['Modelo:', modelName], ['Número de Atributos:', len(colunas)]]
    cursor = InsertField(table, c, cursor)

    _DrawImageAt(c, 'images/feature_importance.png', 7 * cm, 12.6 * cm,
                 11 * cm)

    cursor = SkipLine(cursor, n=3)
    c.showPage()

    ##########
    # Page 2 #
    ##########

    # About the Dataset
    c.translate(MARGIN_X, 0)
    cursor = HEIGHT - MARGIN_Y
    cursor = InsertSection(c, 0, 'Dataset', cursor)

    # Unique Values on each column feature
    table2 = [[col, uniqueValues[col]] for col in uniqueValues.keys()]
    cursor = InsertField(table2, c, cursor)

    # Class Balance Plot
    total = sum(ClassBalance.values())
    valueInPercentage = [percentage(v, total) for v in ClassBalance.values()]

    labels = ['DP' if (n == 1) else 'DI' for n in ClassBalance.keys()]
    colors = ['#5B9BD5', '#BDD7EE']

    # Use a fresh figure: the plots created above may still be open (they
    # are shown without blocking), and drawing through pyplot's current
    # figure would paint the pie on top of the last one of them.
    fig, ax = plt.subplots()
    ax.pie(valueInPercentage,
           labels=labels,
           colors=colors,
           startangle=120,
           frame=False,
           autopct='%.1f %%')
    # White disc in the middle turns the pie into a ring.
    centre_circle = plt.Circle((0, 0),
                               0.5,
                               color='black',
                               fc='white',
                               linewidth=0)
    ax.add_artist(centre_circle)
    ax.set_title('Balanceamento', fontsize=14)
    ax.axis('equal')
    fig.tight_layout()
    plt.show()

    fig.savefig('images/class_balance_plot.png',
                bbox_inches='tight',
                transparent=True)

    # Show Class Balance Plot
    _DrawImageAt(c, 'images/class_balance_plot.png', 8.5 * cm, 18.3 * cm,
                 12 * cm)

    # Codes distribution on the dataset
    _DrawImageAt(c, 'images/code_frequency.png', 0 * cm, MARGIN_Y, 17.5 * cm)

    c.showPage()

    ##########
    # Page 3 #
    ##########

    # Metrics - Test
    sectionTitle = 'Métricas - Teste (%.2f %% - %d)' % (percentage(
        len(X_test), len(X)), len(X_test))
    _DrawMetricsPage(c, sectionTitle, metrics_test,
                     'images/confusion_matrix_test.png',
                     'images/accuracy_by_code_test.png')

    ##########
    # Page 4 #
    ##########

    # Metrics - Train
    sectionTitle = 'Métricas - Treino (%.2f %% - %d)' % (percentage(
        len(X_train), len(X)), len(X_train))
    _DrawMetricsPage(c, sectionTitle, metrics_train,
                     'images/confusion_matrix_train.png',
                     'images/accuracy_by_code_train.png')

    c.save()