def _init_dict(self, data, axes, dtype=None): items = axes[0] # prefilter if items passed if items is not None: items = _ensure_index(items) data = dict((k, v) for k, v in data.iteritems() if k in items) else: items = Index(_try_sort(data.keys())) # figure out the index, if necessary if index is None: index = extract_index(data) # don't force copy because getting jammed in an ndarray anyway # homogenized = _homogenize(data, index, columns, dtype) data, index, columns = _homogenize(data, intersect=intersect) # segregates dtypes and forms blocks matching to columns blocks = form_blocks(homogenized, index, columns) # consolidate for now mgr = BlockManager(blocks, [columns, index]) return mgr.consolidate()
def hist_frame(data, grid=True, xlabelsize=None, xrot=None, ylabelsize=None,
               yrot=None, ax=None, sharex=False, sharey=False, **kwds):
    """
    Draw histogram of the DataFrame's series using matplotlib / pylab.

    Parameters
    ----------
    data : DataFrame
        One histogram is drawn per column, NaN values dropped.
    grid : boolean, default True
        Whether to show axis grid lines
    xlabelsize : int, default None
        If specified changes the x-axis label size
    xrot : float, default None
        rotation of x axis labels
    ylabelsize : int, default None
        If specified changes the y-axis label size
    yrot : float, default None
        rotation of y axis labels
    ax : matplotlib axes object, default None
    sharex : bool, if True, the X axis will be shared amongst all subplots.
    sharey : bool, if True, the Y axis will be shared amongst all subplots.
    kwds : other plotting keyword arguments
        To be passed to hist function

    Returns
    -------
    axes : the 2-D grid of axes produced by ``_subplots``
    """
    import matplotlib.pyplot as plt

    n = len(data.columns)

    # Grow the grid one column, then one row, at a time until it can hold
    # n plots; this keeps it as close to square as possible.
    rows, cols = 1, 1
    while rows * cols < n:
        if cols > rows:
            rows += 1
        else:
            cols += 1

    _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False,
                        sharex=sharex, sharey=sharey)

    # Stays at -1 when there are no columns, so the clean-up loop below
    # still has a defined starting point.
    i = -1
    for i, col in enumerate(com._try_sort(data.columns)):
        # Floor division keeps the row index an integer even when ``/``
        # means true division.
        ax = axes[i // cols][i % cols]
        ax.xaxis.set_visible(True)
        ax.yaxis.set_visible(True)
        ax.hist(data[col].dropna().values, **kwds)
        ax.set_title(col)
        ax.grid(grid)

        if xlabelsize is not None:
            plt.setp(ax.get_xticklabels(), fontsize=xlabelsize)
        if xrot is not None:
            plt.setp(ax.get_xticklabels(), rotation=xrot)
        if ylabelsize is not None:
            plt.setp(ax.get_yticklabels(), fontsize=ylabelsize)
        if yrot is not None:
            plt.setp(ax.get_yticklabels(), rotation=yrot)

    # Hide the grid cells that did not receive a histogram.
    for j in range(i + 1, rows * cols):
        ax = axes[j // cols, j % cols]
        ax.set_visible(False)

    ax.get_figure().subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
def _sanitize_and_check(indexes): kinds = list(set([type(index) for index in indexes])) if list in kinds: if len(kinds) > 1: indexes = [Index(com._try_sort(x)) if not isinstance(x, Index) else x for x in indexes] kinds.remove(list) else: return indexes, "list" if len(kinds) > 1 or Index not in kinds: return indexes, "special" else: return indexes, "array"
def _sanitize_and_check(indexes): kinds = list(set([type(index) for index in indexes])) if list in kinds: if len(kinds) > 1: indexes = [Index(com._try_sort(x)) if not isinstance(x, Index) else x for x in indexes] kinds.remove(list) else: return indexes, 'list' if len(kinds) > 1 or Index not in kinds: return indexes, 'special' else: return indexes, 'array'
def _sanitize_and_check(indexes): kinds = list({type(index) for index in indexes}) if list in kinds: if len(kinds) > 1: indexes = [Index(com._try_sort(x)) if not isinstance(x, Index) else x for x in indexes] kinds.remove(list) else: return indexes, 'list' if len(kinds) > 1 or Index not in kinds: return indexes, 'special' else: return indexes, 'array'
def _init_dict(self, data, axes, dtype=None):
    """
    Build a consolidated ``BlockManager`` from a dict of frame-like values.

    Parameters
    ----------
    data : dict
        Values that are not ``DataFrame`` instances are converted with
        ``DataFrame(value)``; the replacement is written back into
        ``data`` itself.
    axes : sequence of three
        ``(items, major, minor)``.  Any entry may be None: items then
        default to the (sorted, when sortable) keys of ``data``, major and
        minor to the union of the frames' indexes / columns.
    dtype : optional
        If given, every frame is cast with ``astype(dtype)``; also used
        for the NaN filler created for items missing from ``data``.

    Returns
    -------
    The consolidated ``BlockManager``.
    """
    items, major, minor = axes

    if items is None:
        items = Index(_try_sort(data.keys()))
    else:
        # keep only the entries that were asked for
        items = _ensure_index(items)
        data = dict((key, val) for key, val in data.iteritems()
                    if key in items)

    for key, val in data.iteritems():
        if not isinstance(val, DataFrame):
            data[key] = DataFrame(val)

    if major is None:
        major = _union_indexes([frame.index for frame in data.values()])
    if minor is None:
        minor = _union_indexes([frame.columns for frame in data.values()])

    axes = [items, major, minor]

    # shallow copy: values are replaced below, the frames are not touched
    reshaped_data = data.copy()

    # every item ends up as a (1, len(major), len(minor)) array
    filler_shape = (1, len(major), len(minor))
    for key in items:
        if key in data:
            frame = data[key].reindex(index=major, columns=minor, copy=False)
            if dtype is not None:
                frame = frame.astype(dtype)
            block = frame.values
            reshaped_data[key] = block.reshape((1,) + block.shape)
        else:
            filler = np.empty(filler_shape, dtype=dtype)
            filler.fill(np.nan)
            reshaped_data[key] = filler

    # segregates dtypes and forms blocks matching to columns
    blocks = form_blocks(reshaped_data, axes)
    return BlockManager(blocks, axes).consolidate()
def compare_variables(self, cuts= None, **kwargs):
    """
    Presumably meant to overlay, per numeric column, histograms of
    ``self.data1`` and ``self.data2`` on a grid of subplots.

    NOTE(review): this body is not runnable as written.  The
    ``if passed_cut (evt, cuts)`` line has no colon and no body, and the
    names ``passed_cut``, ``evt``, ``ax``, ``sharex``, ``sharey``,
    ``figsize``, ``layout``, ``data1``, ``data2``, ``bins``, ``kwds``,
    ``grid``, ``xlabelsize``, ``xrot``, ``ylabelsize`` and ``yrot`` are
    never bound inside this function -- confirm where they are supposed
    to come from.

    Parameters
    ----------
    cuts : optional, default None
        Only compared against None here; the cut handling is unfinished.
    **kwargs
        Only ``kwargs['alpha']`` is written (see below); nothing is read
        from it afterwards.

    Returns
    -------
    The ``axes`` object returned by ``plotting._subplots``.
    """
    # data1: DataFrame
    # data2: DataFrame
    # column: string or sequence
    #     If passed, will be used to limit data to a subset of columns
    # NOTE(review): the membership test looks at ``self.kwargs`` but the
    # default is stored in the local ``kwargs`` -- confirm which is meant.
    if 'alpha' not in self.kwargs:
        kwargs['alpha'] = 0.5

    # if column is not None:
    #     if not isinstance(column, (list, np.ndarray, Index)):
    #         column = [column]
    #     data1 = data1[column]
    #     data2 = data2[column]
    if cuts != None:
        # NOTE(review): incomplete statement (syntax error) -- no colon, no body
        if passed_cut (evt, cuts)
    # restrict both frames to their numeric columns
    self.data1 = self.data1._get_numeric_data()
    self.data2 = self.data2._get_numeric_data()
    naxes = len(self.data1.columns)

    fig, axes = plotting._subplots(naxes=naxes, ax=ax, squeeze=False,
                                   sharex=sharex, sharey=sharey, figsize=figsize,
                                   layout=layout)
    _axes = plotting._flatten(axes)

    for i, col in enumerate(com._try_sort(self.data1.columns)):
        ax = _axes[i]
        # NOTE(review): ``data1`` / ``data2`` below are bare names, not the
        # ``self.data1`` / ``self.data2`` attributes prepared above.
        # common range so both histograms share the same bin edges
        low = min(data1[col].min(), data2[col].min())
        high = max(data1[col].max(), data2[col].max())
        ax.hist(data1[col].dropna().values,
                bins=bins, range=(low,high), **kwds)
        ax.hist(data2[col].dropna().values,
                bins=bins, range=(low,high), **kwds)
        ax.set_title(col)
        ax.grid(grid)

    plotting._set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
                              ylabelsize=ylabelsize, yrot=yrot)
    fig.subplots_adjust(wspace=0.3, hspace=0.7)

    return axes
def hist_frame(data, grid=True, xlabelsize=None, xrot=None, ylabelsize=None,
               yrot=None, ax=None, **kwds):
    """
    Draw histogram of the DataFrame's series using matplotlib / pylab.

    Parameters
    ----------
    data : DataFrame
        One histogram is drawn per column, NaN values dropped.
    grid : boolean, default True
        Whether to show axis grid lines
    xlabelsize : int, default None
        If specified changes the x-axis label size
    xrot : float, default None
        rotation of x axis labels
    ylabelsize : int, default None
        If specified changes the y-axis label size
    yrot : float, default None
        rotation of y axis labels
    ax : matplotlib axes object, default None
    kwds : other plotting keyword arguments
        To be passed to hist function

    Returns
    -------
    axes : the grid of axes produced by ``_subplots``
    """
    import matplotlib.pyplot as plt

    n = len(data.columns)

    # smallest k such that a k x k grid holds all n histograms
    k = 1
    while k ** 2 < n:
        k += 1
    _, axes = _subplots(nrows=k, ncols=k, ax=ax)

    for i, col in enumerate(com._try_sort(data.columns)):
        # divmod yields integer (row, column) positions even when ``/``
        # means true division, so the grid lookup never sees a float.
        row, pos = divmod(i, k)
        ax = axes[row][pos]
        ax.hist(data[col].dropna().values, **kwds)
        ax.set_title(col)
        ax.grid(grid)

        if xlabelsize is not None:
            plt.setp(ax.get_xticklabels(), fontsize=xlabelsize)
        if xrot is not None:
            plt.setp(ax.get_xticklabels(), rotation=xrot)
        if ylabelsize is not None:
            plt.setp(ax.get_yticklabels(), fontsize=ylabelsize)
        if yrot is not None:
            plt.setp(ax.get_yticklabels(), rotation=yrot)

    return axes
def _init_dict(self, data, index, columns, dtype=None):
    """
    Convert a dict of column data into sparse columns and hand them to
    ``to_manager``.

    Parameters
    ----------
    data : dict
        Column label -> Series, SparseArray, dict or array-like.
    index : Index or None
        When None it is derived with ``extract_index`` from the values.
    columns : optional
        When given, ``data`` is filtered to these labels and any label
        missing from ``data`` gets an all-NaN sparse column; when None the
        (sorted, when sortable) keys of ``data`` are used.
    dtype : optional
        Forwarded to every ``SparseArray`` built here.

    Returns
    -------
    Whatever ``to_manager(sdict, columns, index)`` returns.
    """
    if columns is None:
        columns = Index(_try_sort(list(data.keys())))
    else:
        # pre-filter out columns if we passed it
        columns = _ensure_index(columns)
        data = dict((name, raw) for name, raw in compat.iteritems(data)
                    if name in columns)

    if index is None:
        index = extract_index(list(data.values()))

    def _make_sparse(values):
        # always copies; kind / fill value come from the instance defaults
        return SparseArray(values, kind=self._default_kind,
                           fill_value=self._default_fill_value,
                           copy=True, dtype=dtype)

    sdict = DataFrame()
    for name, raw in compat.iteritems(data):
        if isinstance(raw, Series):
            # Force alignment, no copy necessary
            aligned = raw if raw.index.equals(index) else raw.reindex(index)
            if isinstance(aligned, SparseSeries):
                column = aligned
            else:
                column = _make_sparse(aligned.values)
        elif isinstance(raw, SparseArray):
            column = raw.copy()
        elif isinstance(raw, dict):
            # look every index label up, NaN where the dict has no entry
            column = _make_sparse([raw.get(label, nan) for label in index])
        else:
            column = _make_sparse(raw)
        sdict[name] = column

    # TODO: figure out how to handle this case, all nan's?
    # add in any other columns we want to have (completeness)
    all_nan = np.empty(len(index))
    all_nan.fill(nan)
    for name in columns:
        if name not in sdict:
            sdict[name] = _make_sparse(all_nan)

    return to_manager(sdict, columns, index)
def _iter_data(self):
    """
    Yield ``(label, values)`` pairs for the data held in ``self.data``.

    * Series / ndarray: a single pair, the stringified ``self.label`` and
      ``np.asarray(self.data)``.
    * DataFrame: one pair per column (columns passed through
      ``com._try_sort`` when ``self.sort_columns`` is true); a column with
      no counted values yields zeros of length ``len(df)`` instead.
    * anything else: nothing is yielded.
    """
    from pandas.core.frame import DataFrame

    if isinstance(self.data, (Series, np.ndarray)):
        yield com._stringify(self.label), np.asarray(self.data)
        return

    if not isinstance(self.data, DataFrame):
        return

    frame = self.data
    ordered = com._try_sort(frame.columns) if self.sort_columns else frame.columns

    for name in ordered:
        has_no_values = frame[name].count() == 0
        # is this right?
        if has_no_values:
            payload = np.zeros(len(frame))
        else:
            payload = frame[name].values

        yield com._stringify(name), payload
def _init_dict(self, data, axes, dtype=None):
    """
    Assemble one array per info-axis label from a dict and pass them to
    ``self._init_arrays``.

    Parameters
    ----------
    data : dict
        Values that are plain dicts are replaced, in ``data`` itself, by
        ``self._constructor_sliced(value)``.
    axes : list
        The entry at ``self._info_axis_number`` is popped off (the list is
        modified) and used as the info axis; remaining None entries are
        filled in through ``self._extract_axis``.
    dtype : optional
        Applied with ``astype`` to sliced objects and used for NaN fillers.

    Returns
    -------
    Whatever ``self._init_arrays`` returns.
    """
    haxis = axes.pop(self._info_axis_number)

    if haxis is None:
        keys = list(data.keys())
        # an OrderedDict keeps its own order, anything else gets sorted
        if not isinstance(data, OrderedDict):
            keys = _try_sort(keys)
        haxis = Index(keys)
    else:
        # prefilter if haxis passed
        haxis = _ensure_index(haxis)
        data = OrderedDict((key, val) for key, val in compat.iteritems(data)
                           if key in haxis)

    for key, val in compat.iteritems(data):
        if isinstance(val, dict):
            data[key] = self._constructor_sliced(val)

    # extract axis for remaining axes & create the slicemap
    raxes = []
    for pos, given in enumerate(axes):
        if given is None:
            given = self._extract_axis(self, data, axis=pos)
        raxes.append(given)
    raxes_sm = self._extract_axes_for_slice(self, raxes)

    filler_shape = [len(axis) for axis in raxes]
    arrays = []
    for label in haxis:
        entry = data.get(label)
        if entry is None:
            # label requested but absent: all-NaN placeholder
            block = np.empty(filler_shape, dtype=dtype)
            block.fill(np.nan)
        elif isinstance(entry, self._constructor_sliced):
            reindex_kwargs = raxes_sm.copy()
            reindex_kwargs['copy'] = False
            entry = entry.reindex(**reindex_kwargs)
            if dtype is not None:
                entry = entry.astype(dtype)
            block = entry.values
        else:
            # any other value is passed through untouched
            block = entry
        arrays.append(block)

    return self._init_arrays(arrays, haxis, [haxis] + raxes)
def _init_dict(self, data, index, columns, dtype=None):
    """
    Convert a dict of column data into a dict of sparse columns and hand
    it to ``dict_to_manager``.

    Parameters
    ----------
    data : dict
        Column label -> Series, SparseArray, dict or array-like.
    index : Index or None
        When None it is derived with ``extract_index`` from the values.
    columns : optional
        When given, ``data`` is filtered to these labels and any label
        missing from ``data`` gets an all-NaN sparse column; when None the
        (sorted, when sortable) keys of ``data`` are used.
    dtype : optional
        Not used by this body.

    Returns
    -------
    Whatever ``dict_to_manager(sdict, columns, index)`` returns.
    """
    if columns is None:
        columns = Index(_try_sort(list(data.keys())))
    else:
        # pre-filter out columns if we passed it
        columns = _ensure_index(columns)
        data = dict((name, raw) for name, raw in compat.iteritems(data)
                    if name in columns)

    if index is None:
        index = extract_index(list(data.values()))

    def _make_sparse(values):
        # always copies; kind / fill value come from the instance defaults
        return SparseArray(values, kind=self._default_kind,
                           fill_value=self._default_fill_value,
                           copy=True)

    sdict = {}
    for name, raw in compat.iteritems(data):
        if isinstance(raw, Series):
            # Force alignment, no copy necessary
            aligned = raw if raw.index.equals(index) else raw.reindex(index)
            if isinstance(aligned, SparseSeries):
                column = aligned
            else:
                column = _make_sparse(aligned.values)
        elif isinstance(raw, SparseArray):
            # rebuilt from its values with this instance's defaults
            column = _make_sparse(raw.values)
        elif isinstance(raw, dict):
            # look every index label up, NaN where the dict has no entry
            column = _make_sparse([raw.get(label, nan) for label in index])
        else:
            column = _make_sparse(raw)
        sdict[name] = column

    # TODO: figure out how to handle this case, all nan's?
    # add in any other columns we want to have (completeness)
    all_nan = np.empty(len(index))
    all_nan.fill(nan)
    for name in columns:
        if name not in sdict:
            sdict[name] = _make_sparse(all_nan)

    return dict_to_manager(sdict, columns, index)
def _init_dict(self, data, axes, dtype=None):
    """
    Turn a dict of sliced objects into per-label arrays and delegate to
    ``self._init_arrays``.

    Parameters
    ----------
    data : dict
        Plain-dict values are converted, inside ``data``, with
        ``self._constructor_sliced``.
    axes : list
        Modified here: the info axis (position ``self._info_axis_number``)
        is popped off.  Remaining entries that are None are computed with
        ``self._extract_axis``.
    dtype : optional
        Cast applied to sliced objects; dtype of the NaN placeholders.

    Returns
    -------
    The result of ``self._init_arrays(arrays, haxis, [haxis] + raxes)``.
    """
    haxis = axes.pop(self._info_axis_number)

    # prefilter if haxis passed
    if haxis is not None:
        haxis = _ensure_index(haxis)
        kept = OrderedDict()
        for label, obj in compat.iteritems(data):
            if label in haxis:
                kept[label] = obj
        data = kept
    else:
        labels = list(data.keys())
        # only impose a sort when the mapping carries no order of its own
        ordered_input = isinstance(data, OrderedDict)
        haxis = Index(labels if ordered_input else _try_sort(labels))

    for label, obj in compat.iteritems(data):
        if isinstance(obj, dict):
            data[label] = self._constructor_sliced(obj)

    # extract axis for remaining axes & create the slicemap
    raxes = [self._extract_axis(self, data, axis=num) if axis is None else axis
             for num, axis in enumerate(axes)]
    raxes_sm = self._extract_axes_for_slice(self, raxes)

    placeholder_shape = [len(axis) for axis in raxes]
    arrays = []
    for label in haxis:
        obj = data.get(label)
        result = obj
        if obj is None:
            # requested label with no data behind it
            result = np.empty(placeholder_shape, dtype=dtype)
            result.fill(np.nan)
        elif isinstance(obj, self._constructor_sliced):
            slicer = raxes_sm.copy()
            slicer['copy'] = False
            obj = obj.reindex(**slicer)
            if dtype is not None:
                obj = obj.astype(dtype)
            result = obj.values
        arrays.append(result)

    return self._init_arrays(arrays, haxis, [haxis] + raxes)
def _init_dict(self, data, axes, dtype=None):
    """
    Build a consolidated ``BlockManager`` from a dict of frames.

    Parameters
    ----------
    data : dict
        Plain-dict values are converted, inside ``data``, to ``DataFrame``.
    axes : sequence of three
        ``(items, major, minor)``.  A None items entry defaults to the
        (sorted, when sortable) keys of ``data``; None major / minor are
        computed with ``_extract_axis`` (axis 0 / axis 1).
    dtype : optional
        Cast applied to each frame; dtype of the NaN placeholder used for
        items that have no entry in ``data``.

    Returns
    -------
    The consolidated ``BlockManager``.
    """
    items, major, minor = axes

    if items is None:
        items = Index(_try_sort(data.keys()))
    else:
        # prefilter if items passed
        items = _ensure_index(items)
        data = dict((key, val) for key, val in data.iteritems()
                    if key in items)

    for key, val in data.iteritems():
        if isinstance(val, dict):
            data[key] = DataFrame(val)

    if major is None:
        major = _extract_axis(data, axis=0)
    if minor is None:
        minor = _extract_axis(data, axis=1)

    axes = [items, major, minor]

    # shallow copy: only the mapping is duplicated, not the frames
    reshaped_data = data.copy()

    placeholder_shape = (len(major), len(minor))
    for key in items:
        entry = data.get(key)
        if entry is None:
            block = np.empty(placeholder_shape, dtype=dtype)
            block.fill(np.nan)
        elif isinstance(entry, DataFrame):
            entry = entry.reindex(index=major, columns=minor, copy=False)
            if dtype is not None:
                entry = entry.astype(dtype)
            block = entry.values
        else:
            # non-frame values are stored as they are
            block = entry
        reshaped_data[key] = block

    # segregates dtypes and forms blocks matching to columns
    blocks = form_blocks(reshaped_data, axes)
    return BlockManager(blocks, axes).consolidate()
def hist_frame(data, grid=True, **kwds):
    """
    Draw histogram of the DataFrame's series using matplotlib / pylab.

    Parameters
    ----------
    data : DataFrame
        One histogram is drawn per column, NaN values dropped.
    grid : boolean, default True
        Passed to ``ax.grid`` for every subplot
    kwds : other plotting keyword arguments
        To be passed to hist function

    Returns
    -------
    axes : the grid of axes produced by ``_subplots``
    """
    n = len(data.columns)

    # smallest k such that a k x k grid holds all n histograms
    k = 1
    while k ** 2 < n:
        k += 1
    _, axes = _subplots(nrows=k, ncols=k)

    for i, col in enumerate(com._try_sort(data.columns)):
        # divmod yields integer (row, column) positions even when ``/``
        # means true division, so the grid lookup never sees a float.
        row, pos = divmod(i, k)
        ax = axes[row][pos]
        ax.hist(data[col].dropna().values, **kwds)
        ax.set_title(col)
        ax.grid(grid)

    return axes
def _init_dict(self, data, axes, dtype=None):
    """
    Collect one 2-D array per item from a dict of frames and delegate to
    ``self._init_arrays``.

    Parameters
    ----------
    data : dict
        Plain-dict values are converted, inside ``data``, to ``DataFrame``.
    axes : sequence of three
        ``(items, major, minor)``.  A None items entry defaults to the
        (sorted, when sortable) keys of ``data``; None major / minor are
        computed with ``_extract_axis`` (axis 0 / axis 1).
    dtype : optional
        Cast applied to each frame; dtype of the NaN placeholder used for
        items that have no entry in ``data``.

    Returns
    -------
    The result of ``self._init_arrays(arrays, items, axes)``.
    """
    items, major, minor = axes

    if items is None:
        items = Index(_try_sort(data.keys()))
    else:
        # prefilter if items passed
        items = _ensure_index(items)
        data = dict((key, val) for key, val in data.iteritems()
                    if key in items)

    for key, val in data.iteritems():
        if isinstance(val, dict):
            data[key] = DataFrame(val)

    if major is None:
        major = _extract_axis(data, axis=0)
    if minor is None:
        minor = _extract_axis(data, axis=1)

    axes = [items, major, minor]

    placeholder_shape = (len(major), len(minor))
    arrays = []
    for key in items:
        entry = data.get(key)
        if entry is None:
            # item requested but not supplied: all-NaN placeholder
            block = np.empty(placeholder_shape, dtype=dtype)
            block.fill(np.nan)
        elif isinstance(entry, DataFrame):
            entry = entry.reindex(index=major, columns=minor, copy=False)
            if dtype is not None:
                entry = entry.astype(dtype)
            block = entry.values
        else:
            # non-frame values are passed along unchanged
            block = entry
        arrays.append(block)

    return self._init_arrays(arrays, items, axes)
def hist_frame(data, grid=True, xlabelsize=None, xrot=None, ylabelsize=None,
               yrot=None, ax=None, **kwds):
    """
    Draw histogram of the DataFrame's series using matplotlib / pylab.

    Parameters
    ----------
    data : DataFrame
        One histogram is drawn per column, NaN values dropped.
    grid : boolean, default True
        Whether to show axis grid lines
    xlabelsize : int, default None
        If specified changes the x-axis label size
    xrot : float, default None
        rotation of x axis labels
    ylabelsize : int, default None
        If specified changes the y-axis label size
    yrot : float, default None
        rotation of y axis labels
    ax : matplotlib axes object, default None
    kwds : other plotting keyword arguments
        To be passed to hist function

    Returns
    -------
    axes : the 2-D grid of axes produced by ``_subplots``
    """
    import matplotlib.pyplot as plt

    n = len(data.columns)

    # Grow the grid one column, then one row, at a time until it can hold
    # n plots; this keeps it as close to square as possible.
    rows, cols = 1, 1
    while rows * cols < n:
        if cols > rows:
            rows += 1
        else:
            cols += 1

    _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False)

    # Stays at -1 when there are no columns, so the clean-up loop below
    # still has a defined starting point.
    i = -1
    for i, col in enumerate(com._try_sort(data.columns)):
        # Floor division keeps the row index an integer even when ``/``
        # means true division.
        ax = axes[i // cols][i % cols]
        ax.xaxis.set_visible(True)
        ax.yaxis.set_visible(True)
        ax.hist(data[col].dropna().values, **kwds)
        ax.set_title(col)
        ax.grid(grid)

        if xlabelsize is not None:
            plt.setp(ax.get_xticklabels(), fontsize=xlabelsize)
        if xrot is not None:
            plt.setp(ax.get_xticklabels(), rotation=xrot)
        if ylabelsize is not None:
            plt.setp(ax.get_yticklabels(), fontsize=ylabelsize)
        if yrot is not None:
            plt.setp(ax.get_yticklabels(), rotation=yrot)

    # Hide the grid cells that did not receive a histogram.
    for j in range(i + 1, rows * cols):
        ax = axes[j // cols, j % cols]
        ax.set_visible(False)

    ax.get_figure().subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
def signal_background(self, data1, data2, column=None, grid=True,
                      xlabelsize=None, xrot=None, ylabelsize=None,
                      yrot=None, ax=None, sharex=False, sharey=False,
                      figsize=None, layout=None, bins=10,
                      sample_weights='globalTimesEventWeight', **kwds):
    """Draw histograms comparing each numeric column of `data1` to `data2`.

    `data1` is plotted as "background" with unit weights, `data2` as
    "signal" with every entry weighted by a fixed magnification of 20.
    The figure, axes grid, flat axes iterator and last-used axes are
    stored on ``self`` as ``fig``, ``axes``, ``xs`` and ``ax``.

    Parameters
    ----------
    data1 : DataFrame
        Background sample.
    data2 : DataFrame
        Signal sample.
    column : string or sequence
        If passed, will be used to limit data to a subset of columns
    figsize : tuple, default None
        The size of the figure to create in inches; (10, 6) when None.
    layout : tuple (rows, columns), default None
        Grid of subplots to create.  When None a 4 x 4 grid is used and
        extra rows are added if there are more than 16 columns to plot.
    bins : integer, default 10
        Number of histogram bins to be used
    grid, xlabelsize, xrot, ylabelsize, yrot, ax, sharex, sharey,
    sample_weights
        Accepted for signature compatibility; not used by this method.
    kwds : other plotting keyword arguments
        To be passed to hist function ("alpha" defaults to 0.5).

    Returns
    -------
    The return value of ``plt.show()``.

    Raises
    ------
    ValueError
        If an explicit `layout` has fewer cells than columns to plot.
    """
    # Factor by which every signal (data2) entry is up-weighted.
    magnification = 20

    if "alpha" not in kwds:
        kwds["alpha"] = 0.5

    # Honour a caller-supplied size instead of always overwriting it.
    if figsize is None:
        figsize = (10, 6)

    if column is not None:
        if not isinstance(column, (list, np.ndarray, Index)):
            column = [column]
        data1 = data1[column]
        data2 = data2[column]
    data1 = data1._get_numeric_data()
    data2 = data2._get_numeric_data()
    naxes = len(data1.columns)

    if layout is None:
        # Keep the historical 4 x 4 grid, growing downwards only when it
        # would otherwise be too small (ceil division without floats).
        ncols = 4
        nrows = max(4, -(-naxes // ncols))
    else:
        nrows, ncols = layout
        if nrows * ncols < naxes:
            raise ValueError('layout %s x %s is too small for %s columns'
                             % (nrows, ncols, naxes))

    self.fig, self.axes = plt.subplots(nrows=nrows, ncols=ncols,
                                       squeeze=False, figsize=figsize)
    self.xs = self.axes.flat

    for i, col in enumerate(com._try_sort(data1.columns)):
        self.ax = self.xs[i]

        # common range so both histograms share the same bin edges
        low = min(data1[col].min(), data2[col].min())
        high = max(data1[col].max(), data2[col].max())

        background = data1[col].dropna().values
        signal = data2[col].dropna().values

        # NOTE: All sample weights for both signal and background are set
        # to 1 for now (signal additionally scaled by the magnification).
        # They are built per column so their length always matches the
        # values left after dropping NaNs.
        background_weight = np.ones(len(background))
        signal_weight = np.ones(len(signal)) * magnification

        self.ax.hist(background, weights=background_weight, bins=bins,
                     histtype='stepfilled', range=(low, high), **kwds)
        self.ax.hist(signal, weights=signal_weight, bins=bins,
                     histtype='stepfilled', range=(low, high), **kwds)
        self.ax.set_title(col)
        self.ax.legend(['background', 'signal (%s)' % (magnification)],
                       loc='best')
        self.ax.set_facecolor('white')
        # Customize the major grid
        self.ax.grid(which='major', linestyle='-', linewidth='0.2',
                     color='gray')

    self.fig.subplots_adjust(wspace=0.5, hspace=0.8)

    return plt.show()