def get_roc_auc(labels, score, verbose=True):
    """Return area under ROC curve

    Parameters
    ----------
    labels : np.ndarray
        vector of ground truth
    score : np.ndarray
        vector of scores assigned by classifier
        (i.e. clf.predict_proba(...)[:, -1] in sklearn)
    verbose : boolean
        iff True, prints area under the curve

    Returns
    -------
    float
        area under the curve

    """
    labels = utils.check_col(labels, argument_name='labels')
    score = utils.check_col(score, argument_name='score')
    auc_score = roc_auc_score(labels, score)
    if verbose:
        print 'ROC AUC: {}'.format(auc_score)
    return auc_score
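# A minimal usage sketch (hypothetical, not part of the original module);
# assumes the module-level imports above. For this toy data sklearn's
# roc_auc_score returns 0.75, since one negative is ranked above one positive.
def _example_get_roc_auc():
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.35, 0.8])
    return get_roc_auc(labels, scores, verbose=False)  # -> 0.75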
def plot_prec_recall(labels, score, title='Prec/Recall', verbose=True):
    """Plot precision/recall curve

    Parameters
    ----------
    labels : np.ndarray
        vector of ground truth
    score : np.ndarray
        vector of scores assigned by classifier
        (i.e. clf.predict_proba(...)[:, -1] in sklearn)
    title : str
        title of plot
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    labels = utils.check_col(labels, argument_name='labels')
    score = utils.check_col(score, argument_name='score')

    # adapted from Rayid's prec/recall code
    y_true = labels
    y_score = score
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(
        y_true,
        y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score >= value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)
    fig = plt.figure()
    ax1 = plt.gca()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    plt.title(title)
    if verbose:
        fig.show()
    return fig
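# A minimal usage sketch (hypothetical): plots precision (left axis, blue)
# and recall (right axis, red) against percent of population for a toy
# score vector.
def _example_plot_prec_recall():
    labels = np.array([0, 0, 1, 1, 1, 0])
    scores = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2])
    return plot_prec_recall(labels, scores, title='toy example',
                            verbose=False)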
def test_check_col(self):
    # check_col should accept 1-d arrays, single-column 2-d arrays,
    # lists, and pandas Series, returning an ndarray in each case
    valid1 = np.array([1, 2, 3, 4])
    valid2 = np.array([[1.0], [2], [3], [4]])
    valid3 = [3.0, 2.0, 1.8]
    valid4 = pd.Series(valid1)
    for valid in (valid1, valid2, valid3, valid4):
        self.assertTrue(utils.is_nd(utils.check_col(valid)))
    # non-column inputs should raise ValueError
    self.assertRaises(ValueError, utils.check_col, None)
    self.assertRaises(ValueError, utils.check_col, "lalala")
    self.assertRaises(ValueError, utils.check_col,
                      np.array([[1, 2], [3, 4]]))
    # n_rows must match the length of the column
    utils.check_col(valid1, n_rows=4)
    self.assertRaises(ValueError, utils.check_col, valid1, n_rows=5)
def normalize(col, mean=None, stddev=None, return_fit=False):
    """Generate a normalized column.

    Normalize both mean and std dev.

    Parameters
    ----------
    col : np.ndarray
    mean : float or None
        Mean to use for fit. If None, the mean of col is used.
    stddev : float or None
        Standard deviation to use for fit. If None, the standard
        deviation of col is used.
    return_fit : boolean
        If True, returns tuple of fitted col, mean, and standard dev of
        fit. If False, only returns fitted col

    Returns
    -------
    np.ndarray or (np.ndarray, float, float)

    """
    # see infonavit for applying to a different set than we fit on
    # https://github.com/dssg/infonavit-public/blob/master/pipeline_src/preprocessing.py#L99
    # Logic is from sklearn StandardScaler, but I didn't use sklearn because
    # I want to pass in mean and stddev rather than a fitted StandardScaler
    # https://github.com/scikit-learn/scikit-learn/blob/a95203b/sklearn/preprocessing/data.py#L276
    col = utils.check_col(col)
    if mean is None:
        mean = np.mean(col)
    if stddev is None:
        stddev = np.std(col)
    res = (col - mean) / stddev
    if return_fit:
        return (res, mean, stddev)
    else:
        return res
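# A minimal sketch (hypothetical) of the fit-then-apply pattern the
# infonavit link above refers to: fit on a training column with
# return_fit=True, then reuse the returned mean and stddev on a test column.
def _example_normalize_train_test(train_col, test_col):
    train_norm, fit_mean, fit_stddev = normalize(train_col, return_fit=True)
    test_norm = normalize(test_col, mean=fit_mean, stddev=fit_stddev)
    return train_norm, test_norm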
def plot_on_timeline(col, verbose=True):
    """Plots points on a timeline

    Parameters
    ----------
    col : np.array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    col = utils.check_col(col)
    # http://stackoverflow.com/questions/1574088/plotting-time-in-python-with-matplotlib
    if is_nd(col):
        col = col.astype(datetime)
    dates = matplotlib.dates.date2num(col)
    fig = plt.figure()
    plt.plot_date(dates, [0] * len(dates))
    if verbose:
        plt.show()
    return fig
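# A minimal usage sketch (hypothetical; assumes check_col passes datetime64
# columns through): plot a handful of dates on a timeline. The
# .astype(datetime) call above converts np.datetime64 values to objects
# matplotlib.dates.date2num understands.
def _example_plot_on_timeline():
    dates = np.array(['2015-01-01', '2015-02-15', '2015-03-30'],
                     dtype='datetime64[D]')
    return plot_on_timeline(dates, verbose=False)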
def plot_box_plot(col, title=None, verbose=True):
    """Makes a box plot for a feature

    Parameters
    ----------
    col : np.array
    title : str or None
        title of a plot
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    col = utils.check_col(col)
    fig = plt.figure()
    plt.boxplot(col)
    if title:
        plt.title(title)
    # TODO add col_name to graph
    if verbose:
        plt.show()
    return fig
def plot_simple_histogram(col, verbose=True):
    """Makes a histogram of values in a column

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    col = utils.check_col(col)
    override_xticks = False
    if col.dtype.char in ('O', 'S'):
        # If col is strings, handle differently: one bar per category
        counts = Counter(col)
        categories = sorted(counts.keys())
        hist = [counts[cat] for cat in categories]
        bins = np.arange(len(categories) + 1)
        override_xticks = True
    else:
        hist, bins = np.histogram(col, bins=50)
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    f = plt.figure()
    plt.bar(center, hist, align='center', width=width)
    if override_xticks:
        plt.xticks(center, categories)
    if verbose:
        plt.show()
    return f
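# A minimal usage sketch (hypothetical): the dtype check above means numeric
# columns get 50 numeric bins while string columns get one bar per category.
def _example_plot_simple_histogram():
    numeric_fig = plot_simple_histogram(np.random.randn(1000), verbose=False)
    categorical_fig = plot_simple_histogram(
        np.array(['a', 'b', 'a', 'c', 'a']),
        verbose=False)
    return numeric_fig, categorical_fig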
def crosstab(col1, col2, verbose=True):
    """
    Makes a crosstab of col1 and col2. This is represented as a
    structured array with the following properties:

    1. The first column is the value of col1 being crossed
    2. The name of every column except the first is the value of col2
       being crossed
    3. To find the number of co-occurrences of x from col1 and y in col2,
       find the row that has 'x' in col1 and the column named 'y'. The
       corresponding cell is the number of co-occurrences of x and y

    Parameters
    ----------
    col1 : np.ndarray
    col2 : np.ndarray
    verbose : boolean
        iff True, pretty-print the resulting structured array

    Returns
    -------
    np.ndarray
        structured array

    """
    col1 = utils.check_col(col1, argument_name='col1')
    col2 = utils.check_col(col2, argument_name='col2')
    col1_unique = np.unique(col1)
    col2_unique = np.unique(col2)
    crosstab_rows = []
    for col1_val in col1_unique:
        loc_col1_val = np.where(col1 == col1_val)[0]
        col2_vals = col2[loc_col1_val]
        cnt = Counter(col2_vals)
        # Counter returns 0 for missing keys, so no has_key check is needed
        counts = [cnt[col2_val] for col2_val in col2_unique]
        crosstab_rows.append(['{}'.format(col1_val)] + counts)
    col_names = ['col1_value'] + ['{}'.format(col2_val)
                                  for col2_val in col2_unique]
    ret = convert_to_sa(crosstab_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
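# A minimal usage sketch (hypothetical). For the columns below the result
# has one row per unique value of col1 ('0' and '1') and, after the leading
# 'col1_value' column, one count column per unique value of col2
# ('a' and 'b').
def _example_crosstab():
    col1 = np.array([0, 0, 1, 1, 1])
    col2 = np.array(['a', 'b', 'a', 'a', 'b'])
    return crosstab(col1, col2, verbose=False)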
def distance_from_point(lat_origin, lng_origin, lat_col, lng_col):
    """Generates a column of how far each record is from the origin

    Parameters
    ----------
    lat_origin : number
    lng_origin : number
    lat_col : np.ndarray
    lng_col : np.ndarray

    Returns
    -------
    np.ndarray

    """
    lat_col = utils.check_col(lat_col, argument_name='lat_col')
    lng_col = utils.check_col(lng_col, argument_name='lng_col')
    return distance(lat_origin, lng_origin, lat_col, lng_col)
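# A minimal usage sketch (hypothetical): distance of two records from a
# fixed origin, given latitude and longitude columns. The units of the
# result are whatever the distance() helper returns.
def _example_distance_from_point():
    lats = np.array([41.98, 41.77])
    lngs = np.array([-87.90, -87.75])
    return distance_from_point(41.88, -87.63, lats, lngs)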
def __init__(
        self,
        M,
        labels,
        clfs=[{'clf': RandomForestClassifier}],
        subsets=[{'subset': s_i.SubsetNoSubset}],
        cvs=[{'cv': KFold}],
        trials=None):
    if M is not None:
        if utils.is_nd(M) and not utils.is_sa(M):
            # nd_array; short-circuit the usual type checking and coercion
            if M.ndim != 2:
                raise ValueError('Expected 2-dimensional array for M')
            self.M = M
            self.col_names = ['f{}'.format(i) for i in xrange(M.shape[1])]
            self.labels = utils.check_col(
                labels,
                n_rows=M.shape[0],
                argument_name='labels')
        else:
            # M is either a structured array or something that should
            # be converted
            (M, self.labels) = utils.check_consistent(
                M,
                labels,
                col_argument_name='labels')
            self.col_names = M.dtype.names
            self.M = utils.cast_np_sa_to_nd(M)
    else:
        self.col_names = None
    if trials is None:
        clfs = utils.check_arguments(
            clfs,
            {'clf': lambda clf: issubclass(clf, BaseEstimator)},
            optional_keys_take_lists=True,
            argument_name='clfs')
        subsets = utils.check_arguments(
            subsets,
            {'subset': lambda subset: issubclass(subset,
                                                 s_i.BaseSubsetIter)},
            optional_keys_take_lists=True,
            argument_name='subsets')
        cvs = utils.check_arguments(
            cvs,
            {'cv': lambda cv: issubclass(cv, _PartitionIterator)},
            optional_keys_take_lists=True,
            argument_name='cvs')
    self.clfs = clfs
    self.subsets = subsets
    self.cvs = cvs
    self.trials = trials
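# A minimal construction sketch (hypothetical; assumes this __init__ belongs
# to a class named Experiment, and that optional_keys_take_lists means extra
# keys in each clf dict take lists of candidate values). Each entry in clfs
# pairs a 'clf' class with keyword arguments to try:
def _example_experiment(M, labels):
    return Experiment(
        M,
        labels,
        clfs=[{'clf': RandomForestClassifier, 'max_depth': [1, 10]}])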
def plot_kernel_density(col, verbose=True):
    """Plots kernel density function of column

    From:
    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # TODO address passing an entire matrix
    # TODO respect missing_val
    # TODO what does n do?
    col = utils.check_col(col)
    x_grid = np.linspace(min(col), max(col), 1000)

    grid = GridSearchCV(
        KernelDensity(),
        {'bandwidth': np.linspace(0.1, 1.0, 30)},
        cv=20)  # 20-fold cross-validation
    grid.fit(col[:, None])
    kde = grid.best_estimator_

    pdf = np.exp(kde.score_samples(x_grid[:, None]))

    fig, ax = plt.subplots()
    ax.plot(x_grid, pdf, linewidth=3, alpha=0.5,
            label='bw=%.2f' % kde.bandwidth)
    ax.hist(col, 30, fc='gray', histtype='stepfilled', alpha=0.3,
            normed=True)
    ax.legend(loc='upper left')
    ax.set_xlim(min(col), max(col))
    if verbose:
        plt.show()
    return fig
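# A minimal usage sketch (hypothetical): a bimodal sample, where the
# cross-validated bandwidth search above should pick a bandwidth that
# resolves both modes rather than smoothing them into one.
def _example_plot_kernel_density():
    col = np.concatenate([np.random.normal(-2, 0.5, 200),
                          np.random.normal(2, 0.5, 200)])
    return plot_kernel_density(col, verbose=False)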
def table(col, verbose=True):
    """
    Creates a summary of the number of occurrences of each value in the
    column. Similar to R's table

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, pretty-print the resulting structured array

    Returns
    -------
    np.ndarray
        structured array

    """
    col = utils.check_col(col)
    cnt = Counter(col)
    cat_and_cnt = sorted(cnt.iteritems(), key=lambda item: item[0])
    ret = convert_to_sa(cat_and_cnt, col_names=('col_name', 'count'))
    if verbose:
        pprint_sa(ret)
    return ret
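# A minimal usage sketch (hypothetical). For the column below, the result
# pairs each distinct value with its count, like R's table(): 1 appears
# twice, 2 once, and 3 three times.
def _example_table():
    col = np.array([1, 2, 3, 3, 1, 3])
    return table(col, verbose=False)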
def generate_bin(col, num_bins):
    """Generates a column of categories, where each category is a bin.

    Parameters
    ----------
    col : np.ndarray
    num_bins : int
        number of bins to divide the range of col into

    Returns
    -------
    list of int

    Examples
    --------
    >>> M = np.array([0.1, 3.0, 0.0, 1.2, 2.5, 1.7, 2])
    >>> generate_bin(M, 3)
    [0, 3, 0, 1, 2, 1, 2]

    """
    col = utils.check_col(col)
    minimum = float(min(col))
    maximum = float(max(col))
    distance = float(maximum - minimum)
    # Note that the maximum value lands in a bin by itself, so this
    # produces num_bins + 1 distinct bin labels, as in the example above
    return [int((x - minimum) / distance * num_bins) for x in col]