def genScatterPlot(filename, table, xIndex, yIndex, xLabel, yLabel, title):
    """Draw a scatter plot of column *xIndex* vs. column *yIndex* and save it.

    The figure is written to the PDFs directory under *filename*.
    """
    output.update("... Plot %s" % xLabel)
    pyplot.figure()
    x_values = getCol(table, xIndex)
    y_values = getCol(table, yIndex)
    # small blue dots with low alpha so dense regions show as darker areas
    pyplot.plot(x_values, y_values, 'b.', alpha=0.2)
    pyplot.xlabel(xLabel)
    pyplot.ylabel(yLabel)
    pyplot.suptitle(title)
    pyplot.savefig(PDFs + filename)
def genFrequencyGraph(filename, table, index, label, title):
    """Draw a 100-bin histogram of the given table column and save it.

    The figure is written to the PDFs directory under *filename*.
    """
    output.update("... Plot %s" % label)
    pyplot.figure()
    column_values = getCol(table, index)
    pyplot.hist(column_values, bins=100)
    pyplot.xlabel(label)
    pyplot.suptitle(title)
    pyplot.savefig(PDFs + filename)
def att_freqs(instances, att_index, class_index):
    """
    Gives the stats of the distribution of class_labels given an index.

    :param instances: a table
    :param att_index: the index of the attribute to get class_label stats on
    :param class_index: the index of the class_labels
    :return: {att_val: [{class1: freq, class2: freq, ...}, total], ...}
    """
    unique_atts = set(table_utils.getCol(instances, att_index))
    unique_classes = set(table_utils.getCol(instances, class_index))

    # one [per-class counter dict, running total] pair per attribute value
    freqs = {
        att: [dict.fromkeys(unique_classes, 0), 0]
        for att in unique_atts
    }

    # tally each row into its attribute value's entry
    for row in instances:
        entry = freqs[row[att_index]]
        entry[0][row[class_index]] += 1
        entry[1] += 1
    return freqs
def summary(table):
    """
    Log a Min/Max/Mean/Median summary table for the first six columns.

    The attribute names are positional: ``attributes[i]`` labels column
    ``i`` of *table* — presumably the table's column order matches this
    list; verify against the caller.

    :param table: a data table (list of rows)
    """
    header = ["Attributes", "Min", "Max", "Mean", "Median"]
    attributes = [
        "Score", "Link Ratio", "Tag Ratio", "Entities", "Sentences",
        "Similarity"
    ]
    summaryTable = [header]
    for i, att in enumerate(attributes):
        col = getCol(table, i)
        summaryTable.append([att, min(col), max(col), mean(col), median(col)])
    # Bug fix: "fancy" is not a recognized tabulate format, so tabulate
    # silently fell back to the plain "simple" style; "fancy_grid" is the
    # intended box-drawing style.
    logging.info(
        '\n' +
        str(tabulate(summaryTable, headers="firstrow", tablefmt="fancy_grid")))
def confusion_matrix(labels, class_label_name):
    """
    Log the confusion matrix of the given labels.

    :param labels: a list of tuples of class labels [(actual, predicted), ...]
    :param class_label_name: the name shown in the header's first column
    """
    # Bug fix: build the label set from BOTH actual and predicted values.
    # The original used only actual labels, so a label that appeared only
    # as a prediction made ``the_headers.index(predicted)`` raise ValueError.
    class_labels = list(set(getCol(labels, 0)) | set(getCol(labels, 1)))
    the_headers = [class_label_name]
    the_headers.extend(class_labels)
    the_headers.extend(['Total', 'Recognition (%)'])

    # one row per class label; columns line up with the_headers
    _confusion_matrix = [[0] * len(the_headers) for _ in class_labels]

    # tally predicted vs. actual counts
    for actual, predicted in labels:
        _confusion_matrix[class_labels.index(actual)][the_headers.index(
            predicted)] += 1

    total_col = the_headers.index('Total')
    recognition_col = the_headers.index('Recognition (%)')
    for i, row in enumerate(_confusion_matrix):
        total = sum(row)  # row total (row[0] is still 0 here)
        row[total_col] = total
        row[0] = class_labels[i]  # row label goes in the first column
        # recognition = % of this row's instances predicted correctly (TP)
        correct = row[the_headers.index(class_labels[i])]
        # guard: a label that was only ever predicted has no actual
        # instances, so its row total is 0 — avoid division by zero
        row[recognition_col] = (correct / float(total)) * 100 if total else 0.0

    logging.info(
        '\n' +
        str(tabulate(_confusion_matrix, headers=the_headers, tablefmt="rst")))
def normalize_table(table, except_for=None):
    """
    Normalize every column of *table* except the listed indexes.

    Assumes the table has been cleaned of all NA values.

    :param table: a data table (list of rows of equal length)
    :param except_for: a list of column indexes to leave un-normalized;
        by default every column is normalized
    :return: a new, normalized table (the input is not modified)
    """
    # Bug fix: the original crashed with TypeError when called with the
    # default, because ``index not in None`` is illegal.
    if except_for is None:
        except_for = []
    new_table = [[] for _ in range(len(table))]
    for index in range(len(table[0])):  # walk the columns
        data_column = table_utils.getCol(table, index)
        if index not in except_for:
            data_column = normalized_value(data_column)  # normalize column
        # distribute the (possibly normalized) column back into the rows
        for row_index, value in enumerate(data_column):
            new_table[row_index].append(value)
    return new_table