def test_pandas_confusion_normalized():
    y_true = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
    y_pred = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]
    cm = ConfusionMatrix(y_true, y_pred)
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)

    df = cm.to_dataframe()
    df_norm = cm.to_dataframe(normalized=True)
    assert (df_norm.sum(axis=1).sum() == len(df))
import pandas as pd
from sklearn.metrics import accuracy_score
from pandas_ml import ConfusionMatrix


def calculate_accuracy(csv_filename):

    # Load the CSV with actual and predicted sentiment labels into a DataFrame
    data = pd.read_csv(csv_filename)
    # actual sentiment labels
    y_test = data['Actual_Statement']
    # predicted sentiment labels
    y_pred = data['Prediction']

    # accuracy_score compares the true labels against the predictions
    score = accuracy_score(y_test, y_pred)
    print('Accuracy Score : ', score)

    # calling confusion_matrix method from pandas_ml to show the output
    confusion_matrix = ConfusionMatrix(y_test, y_pred)
    output = confusion_matrix.to_dataframe()

    # the xlsxwriter engine is needed for worksheet.write() below
    writer = pd.ExcelWriter("azure_text_confusion_matrix_output.xlsx",
                            engine='xlsxwriter')
    output.to_excel(writer, startrow=4, startcol=0)
    accuracy_text = 'Accuracy Score : ' + str(score)
    worksheet = writer.sheets['Sheet1']
    worksheet.write(1, 0, accuracy_text)

    writer.save()

    print("Confusion matrix:\n%s" % confusion_matrix)
    def test_pandas_confusion_normalized_issue1(self):
        # should ensure issue 1 is fixed
        # see http://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels/31720054#31720054

        y_true = ['business', 'business', 'business', 'business', 'business',
                  'business', 'business', 'business', 'business', 'business',
                  'business', 'business', 'business', 'business', 'business',
                  'business', 'business', 'business', 'business', 'business']

        y_pred = ['health', 'business', 'business', 'business', 'business',
                  'business', 'health', 'health', 'business', 'business', 'business',
                  'business', 'business', 'business', 'business', 'business',
                  'health', 'health', 'business', 'health']

        cm = ConfusionMatrix(y_true, y_pred)
        assert isinstance(cm, pdml.confusion_matrix.BinaryConfusionMatrix)

        df = cm.to_dataframe()
        df_norm = cm.to_dataframe(normalized=True)
        assert(df_norm.sum(axis=1, skipna=False).fillna(1).sum() == len(df))
  def on_epoch_end(self, epoch, logs=None):
    y_true, y_pred = [], []
    for i in range(self.validation_steps):
      X_batch, y_true_batch = next(self.validation_data)
      y_pred_batch = self.model.predict(X_batch)

      y_true.extend(y_true_batch)
      y_pred.extend(y_pred_batch)

    y_true = np.float32(y_true)
    y_pred = np.float32(y_pred)
    val_loss = log_loss(y_true, y_pred)
    # map integer labels to strings
    y_true = list(y_true.argmax(axis=-1))
    y_pred = list(y_pred.argmax(axis=-1))
    y_true = [self.int2label[y] for y in y_true]
    y_pred = [self.int2label[y] for y in y_pred]
    confusion = ConfusionMatrix(y_true, y_pred)
    accs = self.accuracies(confusion._df_confusion.values)
    acc = self.accuracy(confusion._df_confusion.values)
    # same for wanted words
    y_true = [y if y in self.wanted_words else '_unknown_' for y in y_true]
    y_pred = [y if y in self.wanted_words else '_unknown_' for y in y_pred]
    wanted_words_confusion = ConfusionMatrix(y_true, y_pred)
    wanted_accs = self.accuracies(wanted_words_confusion._df_confusion.values)
    acc_line = ('\n[%03d]: val_categorical_accuracy: %.2f, '
                'val_mean_categorical_accuracy_wanted: %.2f') % (
                    epoch, acc, wanted_accs.mean())  # noqa
    with open('confusion_matrix.txt', 'a') as f:
      f.write('%s\n' % acc_line)
      f.write(confusion.to_dataframe().to_string())

    with open('wanted_confusion_matrix.txt', 'a') as f:
      f.write('%s\n' % acc_line)
      f.write(wanted_words_confusion.to_dataframe().to_string())

    logs['val_loss'] = val_loss
    logs['val_categorical_accuracy'] = acc
    logs['val_mean_categorical_accuracy_all'] = accs.mean()
    logs['val_mean_categorical_accuracy_wanted'] = wanted_accs.mean()
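
# Wiring sketch (assumption, not from the original source): on_epoch_end above
# belongs to a Keras Callback subclass; the class name, constructor arguments,
# and the fit() call below are hypothetical.
# from tensorflow import keras
#
# class ConfusionMatrixCallback(keras.callbacks.Callback):
#     def __init__(self, validation_data, validation_steps, int2label, wanted_words):
#         super().__init__()
#         self.validation_data = validation_data    # generator yielding (X, y) batches
#         self.validation_steps = validation_steps
#         self.int2label = int2label                # e.g. {0: 'yes', 1: 'no', ...}
#         self.wanted_words = wanted_words
#     # ... plus the accuracy()/accuracies() helpers and the on_epoch_end shown above
#
# model.fit(train_gen, epochs=20,
#           callbacks=[ConfusionMatrixCallback(val_gen, val_steps, int2label, wanted_words)])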
Example #8
def gen_conf_mat(y_pred, y_true):
    """Generate confusion matrix with the appropriate naming conventions"""
    #ipdb.set_trace()
    inv_label_dict = {
        0: 'background',
        63: 'liver',
        126: 'l_kidney',
        189: 'r_kidney',
        252: 'spleen'
    }
    # Rename rows and columns for analysis
    tmp_conf_mat = ConfusionMatrix(y_true, y_pred)
    tmp_conf_mat = tmp_conf_mat.to_dataframe()
    filt_df_dict = {
        k: v
        for k, v in inv_label_dict.items()
        if k in tmp_conf_mat.columns.tolist()
    }
    tmp_conf_mat.rename(filt_df_dict, axis=0, inplace=True)
    tmp_conf_mat.rename(filt_df_dict, axis=1, inplace=True)

    return tmp_conf_mat
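
# Usage sketch (assumption, not from the original source): y_true / y_pred are
# flat sequences of the raw organ label values used above (0, 63, 126, 189, 252),
# e.g. flattened segmentation masks.
example_true = [0, 63, 63, 126, 252, 0]
example_pred = [0, 63, 126, 126, 252, 0]
print(gen_conf_mat(example_pred, example_true))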
Example #9
def gen_test_report(clf, y_test, X_test, args, sub_str='_test_report_per_cls'):
    #Writing test report to file
    y_true, y_pred = y_test, clf.predict(X_test)
    #ipdb.set_trace()
    #Producing pandas ML confusion matrix and statistical summary
    tmp_confusion_matrix = ConfusionMatrix(y_true, y_pred)
    #tmp_stat_summary=tmp_confusion_matrix.stats()
    tmp_confusion_matrix = tmp_confusion_matrix.to_dataframe()
    tmp_confusion_matrix.to_csv(
        os.path.join(args.output_model_dir,
                     args.f_nm_str + '_confusion_matrix'))
    #Generation dictionary for analysis
    #with open(os.path.join(args.output_model_dir,args.f_nm_str+'_descriptive_stat.pickle')) as fb:
    #   pickle.dump(tmp_stat_summary,fb)

    file_nm_test_report = args.f_nm_str + sub_str
    #Generating report on test data for analysis
    test_report_raw = classification_report(y_true, y_pred, output_dict=True)
    test_report_df = pd.DataFrame(test_report_raw).transpose()
    #Writing best model to file directory for models
    print(test_report_raw)
    test_report_df.to_csv(
        os.path.join(args.output_model_dir, file_nm_test_report))
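
# Usage sketch (assumption, not from the original source): 'clf' can be any
# fitted scikit-learn classifier; the args namespace only needs the two
# attributes used above, and the values here are illustrative.
from types import SimpleNamespace
from sklearn.dummy import DummyClassifier

args = SimpleNamespace(output_model_dir='.', f_nm_str='dummy_run')
clf = DummyClassifier(strategy='most_frequent').fit([[0], [1], [2]], ['a', 'b', 'a'])
gen_test_report(clf, ['a', 'b'], [[0], [1]], args)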
Example #10
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandas_ml import ConfusionMatrix

df = pd.read_csv('./data/analysis.csv')

for difficulty in [0, 1, 2]:
    sub_df = df.loc[df['difficulty'] == difficulty]

    cmat = ConfusionMatrix(sub_df['predicted'], sub_df['actual'])
    # cmat.print_stats()
    # print(dir(cmat))

    fig = plt.figure()
    ax = fig.add_subplot(111)

    sns.heatmap(cmat.to_dataframe(), square=True, annot=True, fmt="d",
                cmap=plt.cm.bone_r)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

    # map raw class names to display labels on the axis ticks
    a = [item.get_text() for item in ax.get_yticklabels()]
    b = [item.get_text() for item in ax.get_xticklabels()]
    conversion = {'sympathomimetic': "Sympathomimetic",
                  'sedative_hypnotic': "Sedative-Hypnotic",
                  'cholinergic': "Cholinergic",
                  'anticholinergic': "Anticholinergic",
                  'opioid': "Opioid",
                  'serotonin_syndrome': "Serotonin Syndrome"}

    new_ticklabels = [conversion[label] for label in b]
    ax.set_yticklabels(new_ticklabels)
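
    # Continuation sketch (assumption, not in the original source): save each
    # per-difficulty figure to a hypothetical file name instead of discarding it.
    plt.tight_layout()
    plt.savefig('confusion_matrix_difficulty_{}.png'.format(difficulty))
    plt.close(fig)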
Example #11
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandas_ml import ConfusionMatrix

df = pd.read_csv('./data/analysis.csv')

cmat = ConfusionMatrix(df['predicted'], df['actual'])
#cmat.print_stats()

#print dir(cmat)

fig = plt.figure()
ax = fig.add_subplot(111)

sns.heatmap(cmat.to_dataframe(),
            square=True,
            annot=True,
            fmt="d",
            cmap=plt.cm.bone_r)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

a = [item.get_text() for item in ax.get_yticklabels()]
b = [item.get_text() for item in ax.get_xticklabels()]
conversion = {
    'sympathomimetic': "Sympathomimetic",
    'sedative_hypnotic': "Sedative-Hypnotic",
    'cholinergic': "Cholinergic",
    'anticholinergic': "Anticholinergic",
    'opioid': "Opioid",
    'serotonin_syndrome': "Serotonin Syndrome"}

new_ticklabels = [conversion[label] for label in b]
ax.set_yticklabels(new_ticklabels)
Example #12
    df['actual_class'], r_bins = pd.qcut(df['rome'],
                                         10,
                                         labels=list(range(1, 11)),
                                         retbins=True)
    df['predic_class'] = pd.qcut(df['predicted'],
                                 10,
                                 labels=list(range(1, 11)))
    # df['predic_class'] = pd.cut(df['predicted'], bins=r_bins, labels=list(range(1, 11)))

    cm = ConfusionMatrix(df['actual_class'].to_list(),
                         df['predic_class'].to_list())
    cm.print_stats()
    statdict = cm.stats()
    cm_stats = statdict['class']

    matrix = cm.to_dataframe()
    matrix.index.rename('ROME decile', inplace=True)
    # raw string keeps the '\m' in the LaTeX label from being treated as an escape
    matrix.columns.rename(r'ROME$_\mathrm{NN}$ decile', inplace=True)

    plt.close()
    # sns.heatmap(matrix / (len(predicted)//10) , cmap='Greys', annot=matrix, fmt='d')
    ax = sns.heatmap(matrix / (len(predicted) // 10),
                     annot=matrix,
                     fmt='d',
                     cmap='gray_r',
                     vmin=0.0,
                     vmax=0.56)
    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=20)
    cbar.set_label("Percentage in decile [1]", fontsize=23)
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=20)
Example #13
class EncoderWithClassifier:
    def __init__(self,
                 encoder_model,
                 name='',
                 activation='sigmoid',
                 loss_function='categorical_crossentropy',
                 optimizer=SGD(lr=0.01),
                 use_last_dim_as_classifier_dim=True,
                 classifier_dim=None):
        self.__encoder_model = encoder_model
        self.__name = name
        self.__activation = activation
        self.__loss_function = loss_function
        self.__optimizer = optimizer
        self.__trained = True
        self.__validated = True
        self.__use_last_dim_as_classifier_dim = use_last_dim_as_classifier_dim
        self.__classifier_dim = classifier_dim
        self.__validateDimensions()
        self.__generateClassifier()
        self.__compile()

    def __validateDimensions(self):
        if not self.__use_last_dim_as_classifier_dim and self.__classifier_dim <= 0:
            raise ValueError(
                "The number of neurons in a layer (classifier_dim) must be greater than zero"
            )

    def __generateClassifier(self):
        ae_output = self.__encoder_model.layers[-1].output

        dim = None

        if not self.__use_last_dim_as_classifier_dim:
            dim = self.__classifier_dim
        else:
            dim = self.__encoder_model.layers[-1].units

        self.__classifier_layers = Dense(dim,
                                         activation=self.__activation,
                                         name='classifier')(ae_output)
        self.__classifier = Model(inputs=[self.__encoder_model.input],
                                  outputs=[self.__classifier_layers])

    def __compile(self):
        self.__classifier.compile(loss=self.__loss_function,
                                  optimizer=self.__optimizer,
                                  metrics=['acc'])
        self.__summary = self.__classifier.summary()

    def __stats(self, path=None):

        file_pattern = path + self.__name + '.{0}.{1}'

        classifier_predictions_max = np.argmax(self.__classifier_predictions,
                                               axis=1)
        Ymax = np.argmax(self.__eval_label, axis=1)

        self.__confusion_matrix = ConfusionMatrix(Ymax,
                                                  classifier_predictions_max)

        self.__status_dump(file_pattern,
                           self.__confusion_matrix,
                           html=True,
                           string=True,
                           pickle=True,
                           stats_as_txt=True,
                           latex=True)

    def __status_dump(self,
                      file_pattern,
                      confusion_matrix,
                      html=False,
                      string=False,
                      pickle=False,
                      stats_as_txt=False,
                      latex=False):
        dataframe = self.__confusion_matrix.to_dataframe()

        if html:
            with open(file_pattern.format('confusion_matrix', 'html'),
                      'w') as file:
                file.write(dataframe.to_html())
        if string:
            with open(file_pattern.format('confusion_matrix', 'txt'),
                      'w') as file:
                file.write(dataframe.to_string())
        if pickle:
            dataframe.to_pickle(
                file_pattern.format('confusion_matrix', 'pickle'))

        if stats_as_txt:
            # drop the leading dot so the '{0}.{1}' pattern does not produce '..'
            with open(file_pattern.format('stats', 'txt'), 'w') as file:
                file.write(str(confusion_matrix.stats()))
        if latex:
            with open(file_pattern.format('confusion_matrix', 'latex_table'),
                      'w') as file:
                file.write(dataframe.to_latex())

    def eval(self, feature=None, label=None):
        self.__eval_feature = feature
        self.__eval_label = label
        self.__classifier_predictions = self.__classifier.predict(feature)

    def train(self,
              feature=None,
              label=None,
              validation=None,
              epochs=None,
              batch_size=None,
              shuffle=True,
              store_history=True,
              early_stopping=None,
              save_every=1,
              callbacks=None):

        h = self.__classifier.fit(x=feature,
                                  y=label,
                                  validation_data=validation,
                                  batch_size=batch_size,
                                  epochs=epochs,
                                  shuffle=shuffle,
                                  callbacks=callbacks)

        if store_history:
            self.__history = h

    def eval_stats(self, reportpath):
        self.__stats(path=reportpath)

    @property
    def classifier(self):
        return self.__classifier

    @property
    def summary(self):
        return self.__summary
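
# Usage sketch (assumption, not from the original source): attach a softmax
# classification head to a trained encoder and train it on labelled data.
# All data names, shapes, and hyper-parameters below are illustrative only.
# clf = EncoderWithClassifier(encoder_model=autoencoder.encoder_model,
#                             name='mnist_encoder',
#                             activation='softmax',
#                             loss_function='categorical_crossentropy',
#                             optimizer=SGD(lr=0.01),
#                             use_last_dim_as_classifier_dim=False,
#                             classifier_dim=10)
# clf.train(feature=X_train, label=y_train_onehot,
#           validation=(X_val, y_val_onehot), epochs=50, batch_size=32)
# clf.eval(feature=X_test, label=y_test_onehot)
# clf.eval_stats(reportpath='reports/')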
Example #14
class Autoencoder:
    def __init__(self,
                 encoder_layers,
                 name='',
                 hidden_layer_activation='relu',
                 output_layer_activation='relu',
                 loss_function='mse',
                 optimizer=SGD(lr=0.01),
                 discard_decoder_model=False):

        self.__name = name
        self.__encoder_layers_config = encoder_layers
        self.__discard_decoder_model = discard_decoder_model

        self.__hidden_layers_activation = hidden_layer_activation
        self.__output_layer_activation = output_layer_activation
        self.__loss_function = loss_function
        self.__optimizer = optimizer
        self.__trained = False
        self.__validated = False
        self.__generateModels()
        self.__compile()

    def __validateEncoderLayers(self):
        """Validate the configured encoder layers.

        Raise ValueError if __encoder_layers_config was not set or if the
        list has one element or fewer.
        """
        print('self.__encoder_layers_config ', self.__encoder_layers_config)

        if not self.__encoder_layers_config:
            raise ValueError(
                'A list with the numbers of neurons in each layer is required.'
            )

        if len(self.__encoder_layers_config) <= 1:
            raise ValueError(
                'To generate an autoencoder you have to provide at least 2 layers (two items of a list).'
            )

    def __generateEncoder(self, input):

        for id, neurons in enumerate(self.__encoder_layers_config[1:]):
            if id == 0:
                self.__encoder_layers = Dense(
                    neurons,
                    activation=self.__hidden_layers_activation,
                    name='enc{}_{}'.format(id, neurons))(input)
            else:
                self.__encoder_layers = Dense(
                    neurons,
                    activation=self.__hidden_layers_activation,
                    name='enc{}_{}'.format(id, neurons))(self.__encoder_layers)

        self.__encoder_model = Model(inputs=[input],
                                     outputs=[self.__encoder_layers])

    def __generateDecoder(self):

        reversed_encoder_layers = self.__encoder_layers_config[:-1]

        for id, neurons in enumerate(reversed(reversed_encoder_layers)):
            if id == 0:
                self.__decoder_layers = Dense(
                    neurons,
                    activation=self.__hidden_layers_activation,
                    name='dec{}_{}'.format(id, neurons))(self.__encoder_layers)
            else:
                decoder_activation = ''

                if id == len(self.__encoder_layers_config[:-1]) - 1:
                    decoder_activation = self.__output_layer_activation

                else:
                    decoder_activation = self.__hidden_layers_activation

                self.__decoder_layers = Dense(
                    neurons,
                    activation=decoder_activation,
                    name='dec{}_{}'.format(id, neurons))(self.__decoder_layers)

    def __generateModels(self):

        self.__validateEncoderLayers()

        input = Input(shape=(self.__encoder_layers_config[0], ))

        self.__generateEncoder(input)
        self.__generateDecoder()

        self.__autoencoder = Model(inputs=[input],
                                   outputs=[self.__decoder_layers])

#		if not self.__discard_decoder_model:
#			decoder_input = self.__encoder_model.layers[-1].output
#
#			self.__decoder_model = Model(inputs=[decoder_input], outputs=[self.__autoencoder.layers[-1](decoder_input)])

    def __compile(self):
        self.__autoencoder.compile(loss=self.__loss_function,
                                   optimizer=self.__optimizer,
                                   metrics=['accuracy'])
        self.__summary = self.__autoencoder.summary()

    def train_and_eval(self,
                       feature=None,
                       feature_validation=None,
                       epochs=1000,
                       batch_size=32,
                       shuffle=True,
                       store_history=True,
                       callbacks=None):

        validation_data = None

        #if not feature_validation == None:
        validation_data = (feature_validation, feature_validation)

        h = self.__autoencoder.fit(x=feature,
                                   y=feature,
                                   validation_data=validation_data,
                                   shuffle=shuffle,
                                   epochs=epochs,
                                   batch_size=batch_size,
                                   callbacks=callbacks)

        if store_history:
            self.__history = h

        self.__trained = True
        self.__validated = True

    def __stats(self):

        file_pattern = 'reports/' + self.__name + '.{0}.{1}'

        classifier_predictions_max = np.argmax(self.__classifier_predictions,
                                               axis=1)
        Ymax = np.argmax(self.__eval_label, axis=1)

        self.__confusion_matrix = ConfusionMatrix(Ymax,
                                                  classifier_predictions_max)

        self.__status_dump(file_pattern,
                           self.__confusion_matrix,
                           html=True,
                           string=True,
                           pickle=True,
                           stats_as_txt=True,
                           latex=True)

    def __status_dump(self,
                      file_pattern,
                      confusion_matrix,
                      html=False,
                      string=False,
                      pickle=False,
                      stats_as_txt=False,
                      latex=False):
        dataframe = self.__confusion_matrix.to_dataframe()

        print('Report!')
        print('file_pattern ', file_pattern)

        # file extensions without leading dots, so '{0}.{1}' does not produce '..'
        if html:
            with open(file_pattern.format('confusion_matrix', 'html'),
                      'w') as file:
                file.write(dataframe.to_html())
        if string:
            with open(file_pattern.format('confusion_matrix', 'txt'),
                      'w') as file:
                file.write(dataframe.to_string())
        if pickle:
            dataframe.to_pickle(
                file_pattern.format('confusion_matrix', 'pickle'))

        if stats_as_txt:
            with open(file_pattern.format('stats', 'txt'), 'w') as file:
                file.write(str(confusion_matrix.stats()))
        if latex:
            with open(file_pattern.format('confusion_matrix', 'latex_table'),
                      'w') as file:
                file.write(dataframe.to_latex())

    def get_classifier(self,
                       activation=None,
                       loss_function=None,
                       optimizer=None,
                       use_last_dim_as_classifier_dim=None,
                       classifier_dim=None):
        if self.__trained and self.__validated:
            classifier = EncoderWithClassifier(
                self.__encoder_model,
                name=self.__name + '_classifier',
                activation=activation,
                loss_function=loss_function,
                optimizer=optimizer,
                use_last_dim_as_classifier_dim=use_last_dim_as_classifier_dim,
                classifier_dim=classifier_dim)
            return classifier
        else:
            logging.info(
                "impossible to create a classifier. Autoencoder isn't trained or validated!"
            )
        return None

    def eval(self, feature=None, label=None):
        self.__eval_feature = feature
        # __stats() needs the evaluation labels, mirroring EncoderWithClassifier.eval
        self.__eval_label = label
        # this class defines no __classifier, so predict with the autoencoder itself
        self.__classifier_predictions = self.__autoencoder.predict(feature)

    def eval_stats(self):
        self.__stats()

    @property
    def encoder_model(self):
        return self.__encoder_model

    @property
    def decoder_model(self):
        return self.__decoder_model

    @property
    def autoencoder(self):
        return self.__autoencoder

    @property
    def training_history(self):
        return self.__history

    @property
    def summary(self):
        return self.__summary
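
# Usage sketch (assumption, not from the original source): layer sizes, epochs,
# and data names are illustrative only.
# ae = Autoencoder(encoder_layers=[784, 128, 32], name='mnist_ae')
# ae.train_and_eval(feature=X_train, feature_validation=X_val,
#                   epochs=100, batch_size=64)
# clf = ae.get_classifier(activation='softmax',
#                         loss_function='categorical_crossentropy',
#                         optimizer=SGD(lr=0.01),
#                         use_last_dim_as_classifier_dim=False,
#                         classifier_dim=10)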