def get_data_famafrench(name):
    """Download a Fama/French zip archive and parse it into DataFrames.

    Parameters
    ----------
    name : str
        Dataset name; ``<_FAMAFRENCH_URL>/<name>.zip`` is fetched.

    Returns
    -------
    datasets : dict
        Maps the position of each parsed section to a DataFrame. Sections
        with 10 or fewer rows are skipped.
    """
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            # Close the member handle explicitly instead of leaking it.
            with zf.open(zf.namelist()[0]) as member:
                data = member.readlines()

    # Lines of length 2 delimit the sections of the file
    # (presumably bare "\r\n" lines -- TODO confirm against the raw data).
    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            # The header is the last row with one field fewer than the
            # median row (it has no entry for the index column).
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj)
                      for j, hj in enumerate(header, start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
def test_bar_colors(self):
    """Bar plots follow the default cycle, explicit colors and colormaps."""
    import matplotlib.pyplot as plt
    import matplotlib.colors as colors
    from matplotlib import cm

    default_colors = plt.rcParams.get('axes.color_cycle')
    custom_colors = 'rgcby'
    df = DataFrame(randn(5, 5))
    conv = colors.colorConverter

    def check_bars(ax, expected_for):
        # Every fifth patch is the first bar of a column.
        for pos, bar in enumerate(ax.patches[::5]):
            self.assertEqual(expected_for(pos), bar.get_facecolor())
        tm.close()

    check_bars(
        df.plot(kind='bar'),
        lambda pos: conv.to_rgba(default_colors[pos % len(default_colors)]))

    check_bars(
        df.plot(kind='bar', color=custom_colors),
        lambda pos: conv.to_rgba(custom_colors[pos]))

    # First pass tests str -> colormap, second pass a colormap object.
    for colormap in ('jet', cm.jet):
        ax = df.plot(kind='bar', colormap=colormap)
        jet_colors = lmap(cm.jet, np.linspace(0, 1, 5))
        check_bars(ax, lambda pos: jet_colors[pos])

    df.ix[:, [0]].plot(kind='bar', color='DodgerBlue')
def test_line_colors(self):
    """Line plots honor ``color``, the legacy ``colors`` and ``colormap``."""
    import matplotlib.pyplot as plt
    import sys
    from matplotlib import cm

    custom_colors = 'rgcby'

    plt.close('all')
    df = DataFrame(randn(5, 5))

    ax = df.plot(color=custom_colors)

    lines = ax.get_lines()
    for i, l in enumerate(lines):
        xp = custom_colors[i]
        rs = l.get_color()
        self.assert_(xp == rs)

    # stderr is swapped out while the legacy ``colors`` keyword is used.
    tmp = sys.stderr
    sys.stderr = StringIO()
    try:
        plt.close('all')
        ax2 = df.plot(colors=custom_colors)
        lines2 = ax2.get_lines()
        for l1, l2 in zip(lines, lines2):
            # assert_(a, b) would treat ``b`` as the failure message and
            # never compare the two colors.
            self.assertEqual(l1.get_color(), l2.get_color())
    finally:
        sys.stderr = tmp

    plt.close('all')

    ax = df.plot(colormap='jet')

    rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))

    lines = ax.get_lines()
    for i, l in enumerate(lines):
        xp = rgba_colors[i]
        rs = l.get_color()
        self.assert_(xp == rs)

    plt.close('all')

    ax = df.plot(colormap=cm.jet)

    rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))

    lines = ax.get_lines()
    for i, l in enumerate(lines):
        xp = rgba_colors[i]
        rs = l.get_color()
        self.assert_(xp == rs)

    # make color a list if plotting one column frame
    # handles cases like df.plot(color='DodgerBlue')
    plt.close('all')
    df.ix[:, [0]].plot(color='DodgerBlue')
def test_to_csv_from_csv3(self):
    """Appending a second frame to a CSV round-trips as the concatenation."""
    with ensure_clean('__tmp_to_csv_from_csv3__') as path:
        first = DataFrame(np.random.randn(3, 1))
        second = DataFrame(np.random.randn(3, 1))

        first.to_csv(path)
        second.to_csv(path, mode='a', header=False)

        expected = pd.concat([first, second])
        roundtrip = pd.read_csv(path, index_col=0)

        # Column labels come back as strings; normalise both sides to int.
        roundtrip.columns = [int(label) for label in roundtrip.columns]
        expected.columns = [int(label) for label in expected.columns]
        assert_frame_equal(expected, roundtrip)
def test_andrews_curves(self):
    """andrews_curves honors color tuples, color names and colormaps."""
    from pandas.tools.plotting import andrews_curves
    from matplotlib import cm

    def exercise_color_options(frame):
        # Same sequence of plots/checks for any frame with a 'Name' column.
        _check_plot_works(andrews_curves, frame=frame, class_column='Name')

        palettes = (
            ('#556270', '#4ECDC4', '#C7F464'),
            ['dodgerblue', 'aquamarine', 'seagreen'],
        )
        for palette in palettes:
            ax = _check_plot_works(andrews_curves, frame=frame,
                                   class_column='Name', color=palette)
            self._check_colors(ax.get_lines()[:10], linecolors=palette,
                               mapping=frame['Name'][:10])

        ax = _check_plot_works(andrews_curves, frame=frame,
                               class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, frame['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10], linecolors=cmaps,
                           mapping=frame['Name'][:10])

    exercise_color_options(self.iris)

    length = 10
    df = DataFrame({"A": random.rand(length),
                    "B": random.rand(length),
                    "C": random.rand(length),
                    "Name": ["A"] * length})
    exercise_color_options(df)

    colors = ['b', 'g', 'r']
    df = DataFrame({"A": [1, 2, 3],
                    "B": [1, 2, 3],
                    "C": [1, 2, 3],
                    "Name": colors})
    ax = andrews_curves(df, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, linecolors=colors)

    # Calling with the ``data`` keyword must emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        andrews_curves(data=df, class_column='Name')
def _expand_elements(body):
    """Pad the shorter rows of ``body`` in place with NaN.

    Every row whose length differs from the longest row is extended so
    that all rows end up with the same length. Returns None.
    """
    row_lengths = Series(lmap(len, body))
    widest = row_lengths.max()
    short_rows = row_lengths[row_lengths != widest]

    for position, size in iteritems(short_rows):
        padding = [np.nan] * (widest - size)
        body[position] += padding
def test_split_ranges():
    """com.split_ranges yields ranges covering exactly the truthy items."""

    def _bin(x, width):
        "return int(x) as a base2 string of given width"
        digits = [str((x >> shift) & 1) for shift in range(width - 1, -1, -1)]
        return ''.join(digits)

    def test_locs(mask):
        nfalse = sum(np.array(mask) == 0)

        covered = 0
        for start, stop in com.split_ranges(mask):
            covered += stop - start
            assert 0 not in mask[start:stop]

        # make sure the total items covered by the ranges are a complete cover
        assert covered + nfalse == len(mask)

    # exhaustively test all possible mask sequences of length 8
    ncols = 8
    for number in range(2 ** ncols):
        bits = lmap(int, list(_bin(number, ncols)))  # count up in base2
        test_locs([bit == 1 for bit in bits])

    # base cases
    for base_case in ([], [0], [1]):
        test_locs(base_case)
def _preparse(source, f=compose(_replace_locals, _replace_booleans,
                                _rewrite_assign)):
    """Rewrite a source string by mapping ``f`` over its tokens.

    Parameters
    ----------
    source : str
        A Python source code string
    f : callable
        Takes a ``(toknum, tokval)`` tuple and returns a tuple of the same
        structure, possibly with different elements. Defaults to the
        composition of ``_rewrite_assign``, ``_replace_booleans``, and
        ``_replace_locals``.

    Returns
    -------
    s : str
        Valid Python source code

    Notes
    -----
    ``toknum`` is one of the constants from the ``tokenize`` module and
    ``tokval`` is a string.
    """
    assert callable(f), 'f must be callable'
    rewritten = [f(token) for token in tokenize_string(source)]
    return tokenize.untokenize(rewritten)
def test_parse_dates_column_list(self):
    """parse_dates accepts both positional and named column lists."""
    from pandas.core.datetools import to_datetime

    data = '''date;destination;ventilationcode;unitcode;units;aux_date
01/01/2010;P;P;50;1;12/1/2011
01/01/2010;P;R;50;1;13/1/2011
15/01/2010;P;P;50;1;14/1/2011
01/05/2010;P;P;50;1;15/1/2011'''

    # Build the expectation by parsing without date handling and then
    # converting the date level / column by hand.
    expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4))

    lev = expected.index.levels[0]
    levels = list(expected.index.levels)
    levels[0] = lev.to_datetime(dayfirst=True)
    # hack to get this to work - remove for final test
    levels[0].name = lev.name
    expected.index.set_levels(levels, inplace=True)
    expected['aux_date'] = to_datetime(expected['aux_date'],
                                       dayfirst=True)
    expected['aux_date'] = lmap(Timestamp, expected['aux_date'])
    tm.assertIsInstance(expected['aux_date'][0], datetime)

    # By position first, then by column name.
    for date_columns in ([0, 5], ['date', 'aux_date']):
        df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4),
                           parse_dates=date_columns, dayfirst=True)
        tm.assert_frame_equal(df, expected)
def test_radviz(self):
    """Check radviz patch colors for color tuples, names and colormaps."""
    from pandas.tools.plotting import radviz
    from matplotlib import cm

    df = self.iris
    _check_plot_works(radviz, df, 'Name')

    rgba = ('#556270', '#4ECDC4', '#C7F464')
    ax = _check_plot_works(radviz, df, 'Name', color=rgba)
    # skip Circle drawn as ticks
    patches = [p for p in ax.patches[:20] if p.get_label() != '']
    self._check_colors(patches[:10], facecolors=rgba, mapping=df['Name'][:10])

    cnames = ['dodgerblue', 'aquamarine', 'seagreen']
    # NOTE(review): the return value is not assigned, so ``ax`` below still
    # refers to the axes from the ``rgba`` plot -- confirm this is intended.
    _check_plot_works(radviz, df, 'Name', color=cnames)
    patches = [p for p in ax.patches[:20] if p.get_label() != '']
    self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10])

    # NOTE(review): same as above, ``ax`` is not rebound here either.
    _check_plot_works(radviz, df, 'Name', colormap=cm.jet)
    cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
    patches = [p for p in ax.patches[:20] if p.get_label() != '']
    self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10])

    # Explicit RGBA lists, checked through the legend handles.
    colors = [[0., 0., 1., 1.], [0., 0.5, 1., 1.], [1., 0., 0., 1.]]
    df = DataFrame({"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ['b', 'g', 'r']})
    ax = radviz(df, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, facecolors=colors)
def _parse_raw_thead(self, table):
    """Return the text of the header cells of ``table``.

    A single header cell is returned as a 1-d ndarray; otherwise a list of
    cell texts (empty when the table has no header) is returned.
    """
    header_nodes = self._parse_thead(table)
    if header_nodes:
        texts = [self._text_getter(cell)
                 for cell in self._parse_th(header_nodes[0])]
    else:
        texts = []

    if len(texts) == 1:
        return np.atleast_1d(np.array(texts).squeeze())
    return texts
def _parse_raw_tfoot(self, table):
    """Return the text of the footer cells of ``table``.

    A single footer cell is returned as a 1-d ndarray; otherwise a list of
    cell texts (empty when the table has no footer) is returned.
    """
    footer_nodes = self._parse_tfoot(table)
    if footer_nodes:
        texts = [self._text_getter(cell)
                 for cell in self._parse_td(footer_nodes[0])]
    else:
        texts = []

    if len(texts) == 1:
        return np.atleast_1d(np.array(texts).squeeze())
    return texts
def test_lmap(self):
    """lmap over several iterables matches list(builtins.map(...))."""
    func = lambda x, y, z: x + y + z
    lst = [builtins.range(10), builtins.range(10), builtins.range(10)]

    # check_results receives one-element tuples, spelled out explicitly.
    results = (lmap(func, *lst),)
    expecteds = (list(builtins.map(func, *lst)),)
    lengths = (10,)
    self.check_results(results, expecteds, lengths)
def _get_label_to_i_dict(labels, sort_labels=False):
    """Map each unique label (as a tuple) to its position.

    Parameters
    ----------
    labels : iterable of iterables
        Each element is converted to a tuple before de-duplication.
    sort_labels : bool, default False
        Sort the unique labels before numbering them.

    Returns
    -------
    OrderedDict
        Unique label -> integer position.
    """
    as_tuples = [tuple(label) for label in labels]
    unique_labels = Index(as_tuples).unique().tolist()  # squish
    if sort_labels:
        unique_labels = sorted(unique_labels)

    mapping = OrderedDict()
    for position, label in enumerate(unique_labels):
        mapping[label] = position
    return mapping
def _expand_elements(body):
    """Pad the shorter rows of ``body`` in place with empty strings.

    Every row whose length differs from the longest row is extended so
    that all rows end up with the same length. Returns None.
    """
    row_lengths = Series([len(row) for row in body])
    widest = row_lengths.max()
    short_rows = row_lengths[row_lengths != widest]

    filler = ['']
    for position, size in short_rows.items():
        body[position] += filler * (widest - size)
def _col_size(self, k=None):
    """Calculate size of a data record.

    The per-column sizes are computed from ``self.typlist`` on first use
    and stored on ``self.col_sizes``. Returns the whole list when ``k`` is
    None, otherwise the size of column ``k``.
    """
    if len(self.col_sizes) == 0:
        self.col_sizes = [self._calcsize(typ) for typ in self.typlist]

    sizes = self.col_sizes
    return sizes if k is None else sizes[k]
def test_map(self):
    """map and lmap over several iterables agree with builtins.map."""
    func = lambda x, y, z: x + y + z
    lst = [builtins.range(10), builtins.range(10), builtins.range(10)]
    from_map = map(func, *lst)
    from_lmap = lmap(func, *lst)

    # check_result receives one-element tuples, spelled out explicitly.
    actual = ([from_map, from_lmap],)
    expected = (list(builtins.map(func, *lst)),)
    lengths = (10,)
    self.check_result(actual, expected, lengths)
def inner(x):
    """Raise ValueError unless ``x`` is a legal value or passes a callable."""
    from pandas.io.formats.printing import pprint_thing as pp

    if x in legal_values:
        return
    # Every callable is evaluated (list, not generator) before any() runs.
    if any([c(x) for c in callables]):
        return

    pp_values = pp("|".join(lmap(pp, legal_values)))
    msg = "Value must be one of {pp_values}"
    if len(callables):
        msg += " or a callable"
    raise ValueError(msg.format(pp_values=pp_values))
def _parse_raw_thead(self, table):
    """Return the header rows of ``table`` as lists of cell texts.

    Rows whose cells are all empty strings are dropped. An empty list is
    returned when the table has no header.
    """
    thead = self._parse_thead(table)
    if not thead:
        return []

    rows = []
    for tr in self._parse_tr(thead[0]):
        cells = [self._text_getter(td) for td in self._parse_td(tr)]
        if any([cell != '' for cell in cells]):
            rows.append(cells)
    return rows
def get_results_df(db, rev):
    """Return a DataFrame of benchmark results for a git commit hash.

    The result rows for ``rev`` are joined with the benchmark names on
    ``checksum``, which also becomes the index.
    """
    bench = DataFrame(db.get_benchmarks())
    rows = [list(row) for row in db.get_rev_results(rev).values()]
    results = DataFrame(rows)

    # Since vbench.db._reg_rev_results returns an unlabeled dict,
    # we have to break encapsulation a bit.
    results.columns = list(db._results.c.keys())

    joined = results.join(bench['name'], on='checksum')
    return joined.set_index("checksum")
def convert_score(x):
    """Convert a score string to a float.

    Blank input gives NaN. A string with a '-' after the first character
    is treated as an integer range ``low-high`` and its midpoint is
    returned; anything else is parsed with ``float``.
    """
    text = x.strip()
    if not text:
        return np.nan

    # A '-' at position 0 is a sign, not a range separator.
    if text.find('-') <= 0:
        return float(text)

    low, high = [int(part) for part in text.split('-')]
    return 0.5 * (low + high)
def test_iloc_mask(self):
    """Boolean masks with [], .loc and .iloc give the expected results."""
    # GH 3631, iloc with a mask (of a series) should raise
    df = DataFrame(lrange(5), list('ABCDE'), columns=['a'])
    mask = (df.a % 2 == 0)
    self.assertRaises(ValueError, df.iloc.__getitem__, tuple([mask]))
    mask.index = lrange(len(mask))
    self.assertRaises(NotImplementedError, df.iloc.__getitem__,
                      tuple([mask]))

    # ndarray ok
    result = df.iloc[np.array([True] * len(mask), dtype=bool)]
    assert_frame_equal(result, df)

    # the possibilities
    locs = np.arange(4)
    nums = 2 ** locs
    reps = lmap(bin, nums)
    df = DataFrame({'locs': locs, 'nums': nums}, reps)

    # (mask index source, accessor) -> binary sum of the selected ``nums``
    # or the text of the exception that indexing raises.
    expected = {
        (None, ''): '0b1100',
        (None, '.loc'): '0b1100',
        (None, '.iloc'): '0b1100',
        ('index', ''): '0b11',
        ('index', '.loc'): '0b11',
        ('index', '.iloc'): 'iLocation based boolean indexing cannot use an indexable as a mask',
        ('locs', ''): 'Unalignable boolean Series key provided',
        ('locs', '.loc'): 'Unalignable boolean Series key provided',
        ('locs', '.iloc'): 'iLocation based boolean indexing on an integer type is not available',
    }

    import warnings
    warnings.filterwarnings(action='ignore', category=UserWarning)
    try:
        for idx in [None, 'index', 'locs']:
            mask = (df.nums > 2).values
            if idx:
                mask = Series(mask, list(reversed(getattr(df, idx))))
            for method in ['', '.loc', '.iloc']:
                try:
                    if method:
                        accessor = getattr(df, method[1:])
                    else:
                        accessor = df
                    ans = str(bin(accessor[mask]['nums'].sum()))
                except Exception as e:
                    ans = str(e)

                key = tuple([idx, method])
                r = expected.get(key)
                if r != ans:
                    raise AssertionError("[%s] does not match [%s], received [%s]" %
                                         (key, ans, r))
    finally:
        # Re-enable the warnings even when a comparison above fails, so
        # the 'ignore' filter does not stay installed after this test.
        warnings.filterwarnings(action='always', category=UserWarning)
def _get_one(self, name, *args, **kwargs):
    """Download one Fama/French zip archive and parse it into DataFrames.

    Parameters
    ----------
    name : str
        Dataset name, substituted into the download URL.

    Returns
    -------
    datasets : dict
        Maps the position of each parsed section to a DataFrame. Sections
        with 10 or fewer rows are skipped.

    Raises
    ------
    IOError
        If the HTTP status code of the response is not 200.
    """
    url = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/{name}.zip'\
        .format(name=name)

    response = self.session.get(url)
    # Validate the response before touching its payload.
    if response.status_code != 200:
        raise IOError("Failed to get the data. Check that {0!r} is "
                      "a valid FamaFrench dataset.".format(name))
    # returns bytes (.text returns unicode ; .content returns byte)
    raw = response.content

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            # Close the member handle explicitly instead of leaking it.
            with zf.open(zf.namelist()[0]) as member:
                data = member.readlines()

    # Lines of length 2 delimit the sections of the file
    # (presumably bare "\r\n" lines -- TODO confirm against the raw data).
    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            # The header is the last row with one field fewer than the
            # median row (it has no entry for the index column).
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj)
                      for j, hj in enumerate(header, start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = pd.DataFrame(dataset, index, columns=header)

    return datasets
def test_plot_single_color(self):
    """All bars of a value_counts bar plot share one color."""
    # Example from #20585. All 3 bars should have the same color
    records = {
        'account-start': ['2017-02-03', '2017-03-03', '2017-01-01'],
        'client': ['Alice Anders', 'Bob Baker', 'Charlie Chaplin'],
        'balance': [-1432.32, 10.43, 30000.00],
        'db-id': [1234, 2424, 251],
        'proxy-id': [525, 1525, 2542],
        'rank': [52, 525, 32],
    }
    df = DataFrame(records)
    ax = df.client.value_counts().plot.bar()

    bars = ax.get_children()[0:3]
    colors = [bar.get_facecolor() for bar in bars]
    assert all(color == colors[0] for color in colors)
def test_map_with_string_constructor(self):
    """PeriodIndex.map(str) returns an Index of str elements."""
    raw = [2005, 2007, 2009]
    index = PeriodIndex(raw, freq='A')

    expected = Index([str(year) for year in raw])
    res = index.map(str)

    # should return an Index
    assert isinstance(res, Index)

    # preserve element types
    assert all(isinstance(element, str) for element in res)

    # lastly, values should compare equal
    tm.assert_index_equal(res, expected)
def compute_autocorrelation(data):
    """Return the sample autocorrelation of ``data`` for lags 1..n.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sequence. It is converted with
        ``np.asarray`` so that slices are combined by position.

    Returns
    -------
    numpy.ndarray
        float32 array of length ``n = len(data)``; element ``h - 1`` is
        the autocorrelation at lag ``h``.
    """
    # from pandas.tools.plotting import autocorrelation_plot
    # see http://pandas.pydata.org/pandas-docs/dev/visualization.html#autocorrelation-plot
    # see http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm

    # Work on a plain ndarray: with a pandas Series the product of the two
    # shifted slices would be aligned on labels rather than on position.
    data = np.asarray(data)
    n = len(data)
    mean = np.mean(data)
    c0 = np.sum((data - mean) ** 2) / float(n)

    def r(h):
        return ((data[:n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0

    lags = np.arange(n) + 1
    return np.asarray([r(h) for h in lags], dtype=np.float32)
def applymap(self, func):
    """
    Apply a function elementwise to a DataFrame, i.e. like doing
    map(func, series) for each series in the DataFrame

    Parameters
    ----------
    func : function
        Python function, returns a single value from a single value

    Returns
    -------
    applied : DataFrame
    """
    def map_values(values):
        return [func(value) for value in values]

    return self.apply(map_values)
def test_parallel_coordinates(self):
    """parallel_coordinates honors colors, colormaps and axvlines."""
    from pandas.plotting import parallel_coordinates
    from matplotlib import cm

    df = self.iris

    ax = _check_plot_works(parallel_coordinates,
                           frame=df, class_column='Name')
    nlines = len(ax.get_lines())
    nxticks = len(ax.xaxis.get_ticklabels())

    # Hex tuple first, then named colors.
    palettes = (
        ('#556270', '#4ECDC4', '#C7F464'),
        ['dodgerblue', 'aquamarine', 'seagreen'],
    )
    for palette in palettes:
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name',
                               color=palette)
        self._check_colors(ax.get_lines()[:10], linecolors=palette,
                           mapping=df['Name'][:10])

    ax = _check_plot_works(parallel_coordinates,
                           frame=df, class_column='Name', colormap=cm.jet)
    cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
    self._check_colors(ax.get_lines()[:10], linecolors=cmaps,
                       mapping=df['Name'][:10])

    # Without axvlines there is one line fewer per x tick.
    ax = _check_plot_works(parallel_coordinates,
                           frame=df, class_column='Name', axvlines=False)
    assert len(ax.get_lines()) == (nlines - nxticks)

    colors = ['b', 'g', 'r']
    df = DataFrame({"A": [1, 2, 3],
                    "B": [1, 2, 3],
                    "C": [1, 2, 3],
                    "Name": colors})
    ax = parallel_coordinates(df, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, linecolors=colors)

    # Both legacy call styles must emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        parallel_coordinates(data=df, class_column='Name')
    with tm.assert_produces_warning(FutureWarning):
        parallel_coordinates(df, 'Name', colors=colors)
def to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.

    Dispatches on the type of ``data`` (a DataFrame) or of its first
    element (list/tuple, mapping, Series, Categorical), falls back to the
    field names of a structured array, and finally converts every row to
    a tuple.
    """
    if isinstance(data, ABCDataFrame):
        if columns is None:
            columns = data.columns
            arrays = [data._ixs(pos, axis=1).values
                      for pos in range(len(columns))]
        else:
            arrays = [data._ixs(pos, axis=1).values
                      for pos, label in enumerate(data.columns)
                      if label in columns]
        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []

    first = data[0]

    if isinstance(first, (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)

    if isinstance(first, compat.Mapping):
        return _list_of_dict_to_arrays(data, columns,
                                       coerce_float=coerce_float,
                                       dtype=dtype)

    if isinstance(first, ABCSeries):
        return _list_of_series_to_arrays(data, columns,
                                         coerce_float=coerce_float,
                                         dtype=dtype)

    if isinstance(first, Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns

    is_array_like = isinstance(data, (np.ndarray, ABCSeries, Index))
    if is_array_like and data.dtype.names is not None:
        columns = list(data.dtype.names)
        arrays = [data[field] for field in columns]
        return arrays, columns

    # last ditch effort
    data = lmap(tuple, data)
    return _list_to_arrays(data, columns, coerce_float=coerce_float,
                           dtype=dtype)
def autocorrelation_plot(series, ax=None, **kwds):
    """Autocorrelation plot for time series.

    Parameters:
    -----------
    series: Time series
    ax: Matplotlib axis object, optional
    kwds : keywords
        Options to pass to matplotlib plotting method

    Returns:
    -----------
    class:`matplotlib.axis.Axes`
    """
    import matplotlib.pyplot as plt

    n = len(series)
    data = np.asarray(series)
    if ax is None:
        ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0))

    mean = np.mean(data)
    c0 = np.sum((data - mean) ** 2) / float(n)

    def r(h):
        return ((data[:n - h] - mean) *
                (data[h:] - mean)).sum() / float(n) / c0

    x = np.arange(n) + 1
    y = lmap(r, x)

    z95 = 1.959963984540054
    z99 = 2.5758293035489004
    bound95 = z95 / np.sqrt(n)
    bound99 = z99 / np.sqrt(n)

    # Horizontal reference lines, drawn from top to bottom.
    reference_lines = [
        (bound99, dict(linestyle='--', color='grey')),
        (bound95, dict(color='grey')),
        (0.0, dict(color='black')),
        (-bound95, dict(color='grey')),
        (-bound99, dict(linestyle='--', color='grey')),
    ]
    for level, line_style in reference_lines:
        ax.axhline(y=level, **line_style)

    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    ax.plot(x, y, **kwds)
    if 'label' in kwds:
        ax.legend()
    ax.grid()
    return ax
def data(self, convert_dates=True, convert_categoricals=True, index=None):
    """
    Reads observations from Stata file, converting them into a dataframe

    Parameters
    ----------
    convert_dates : boolean, defaults to True
        Convert date variables to DataFrame time values
    convert_categoricals : boolean, defaults to True
        Read value labels and convert columns to Categorical/Factor
        variables
    index : identifier of index column
        identifier of column that should be used as index of the DataFrame

    Returns
    -------
    y : DataFrame instance

    Raises
    ------
    Exception
        If the data has already been read from this reader.
    """
    # The observations can only be consumed once per reader.
    if self._data_read:
        raise Exception("Data has already been read.")
    self._data_read = True

    stata_dta = self._dataset()

    data = []
    for rownum, line in enumerate(stata_dta):
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        for i, val in enumerate(line):
            #NOTE: This will only be scalar types because missing strings
            # are empty not None in Stata
            if val is None:
                line[i] = np.nan
        data.append(tuple(line))

    if convert_categoricals:
        self._read_value_labels()

    data = DataFrame(data, columns=self.varlist, index=index)

    # Re-cast every non-object column that has a truthy, non-None entry
    # in ``self.dtyplist``.
    cols_ = np.where(self.dtyplist)[0]
    for i in cols_:
        if self.dtyplist[i] is not None:
            col = data.columns[i]
            if data[col].dtype is not np.dtype(object):
                data[col] = Series(data[col], data[col].index, self.dtyplist[i])

    if convert_dates:
        # Positions of the columns whose format is a known date format.
        cols = np.where(lmap(lambda x: x in _date_formats, self.fmtlist))[0]
        for i in cols:
            col = data.columns[i]
            data[col] = data[col].apply(_stata_elapsed_date_to_datetime, args=(self.fmtlist[i],))

    if convert_categoricals:
        # Positions of the columns that have an entry in value_label_dict.
        cols = np.where(lmap(lambda x: x in compat.iterkeys(self.value_label_dict), self.lbllist))[0]
        for i in cols:
            col = data.columns[i]
            labeled_data = np.copy(data[col])
            labeled_data = labeled_data.astype(object)
            # Replace each raw value by its label on the object copy.
            for k, v in compat.iteritems(self.value_label_dict[self.lbllist[i]]):
                labeled_data[data[col] == k] = v
            data[col] = Categorical.from_array(labeled_data)

    return data
def inner(x):
    """Raise ValueError unless ``x`` is one of ``legal_values``."""
    from pandas.core.common import pprint_thing as pp

    if x in legal_values:
        return

    pp_values = lmap(pp, legal_values)
    raise ValueError("Value must be one of %s" % pp("|".join(pp_values)))
def _parse_raw_thead(self, table):
    """Return the text of the header cells of ``table``.

    A single header cell is returned as a squeezed (0-d) ndarray;
    otherwise a list of cell texts (empty when the table has no header)
    is returned.
    """
    header_nodes = self._parse_thead(table)
    if header_nodes:
        texts = [self._text_getter(cell)
                 for cell in self._parse_th(header_nodes[0])]
    else:
        texts = []

    if len(texts) == 1:
        return np.array(texts).squeeze()
    return texts
def _parse_raw_tfoot(self, table):
    """Return the text of the footer cells of ``table``.

    A single footer cell is returned as a squeezed (0-d) ndarray;
    otherwise a list of cell texts (empty when the table has no footer)
    is returned.
    """
    footer_nodes = self._parse_tfoot(table)
    if footer_nodes:
        texts = [self._text_getter(cell)
                 for cell in self._parse_td(footer_nodes[0])]
    else:
        texts = []

    if len(texts) == 1:
        return np.array(texts).squeeze()
    return texts
def test_andrews_curves(self, iris):
    """andrews_curves honors color tuples, color names and colormaps."""
    from pandas.plotting import andrews_curves
    from matplotlib import cm

    hex_colors = ('#556270', '#4ECDC4', '#C7F464')
    named_colors = ['dodgerblue', 'aquamarine', 'seagreen']

    def run_color_checks(frame):
        # Same sequence of plots/checks for any frame with a 'Name' column.
        _check_plot_works(andrews_curves, frame=frame, class_column='Name')

        for chosen in (hex_colors, named_colors):
            ax = _check_plot_works(andrews_curves, frame=frame,
                                   class_column='Name', color=chosen)
            self._check_colors(ax.get_lines()[:10], linecolors=chosen,
                               mapping=frame['Name'][:10])

        ax = _check_plot_works(andrews_curves, frame=frame,
                               class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, frame['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10], linecolors=cmaps,
                           mapping=frame['Name'][:10])

    run_color_checks(iris)

    length = 10
    df = DataFrame({
        "A": random.rand(length),
        "B": random.rand(length),
        "C": random.rand(length),
        "Name": ["A"] * length
    })
    run_color_checks(df)

    colors = ['b', 'g', 'r']
    df = DataFrame({
        "A": [1, 2, 3],
        "B": [1, 2, 3],
        "C": [1, 2, 3],
        "Name": colors
    })
    ax = andrews_curves(df, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, linecolors=colors)

    # Calling with the ``data`` keyword must emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        andrews_curves(data=df, class_column='Name')
def keyfunc(x):
    """Return the underscore-separated numbers embedded in ``x``.

    Every character that is neither a digit nor an underscore is removed,
    together with an underscore directly following it; the remainder is
    split on underscores and each piece converted to int.

    Raises
    ------
    ValueError
        If a piece left after splitting is not a valid integer (for
        example when ``x`` contains no digits).
    """
    import re

    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
    return [int(part) for part in numeric_tuple]
def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False):
    """Round-trip ``df`` through to_csv/from_csv and compare the frames.

    ``r_dtype`` / ``c_dtype`` are short codes ('i', 'f', 's', 'u', 'dt',
    'p') selecting how the row / column labels of both frames are
    normalised before comparison. ``rnlvl`` / ``cnlvl`` are the numbers of
    row / column levels. ``dupe_col`` copies the original columns onto the
    reconstructed frame.

    NOTE(review): ``chunksize`` is not a parameter; it is read from an
    enclosing scope that is not visible here.
    """
    kwargs = dict(parse_dates=False)
    if cnlvl:
        if rnlvl is not None:
            kwargs['index_col'] = lrange(rnlvl)
        kwargs['header'] = lrange(cnlvl)
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8', chunksize=chunksize, tupleize_cols=False)
            recons = DataFrame.from_csv(path, tupleize_cols=False, **kwargs)
    else:
        kwargs['header'] = 0
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8', chunksize=chunksize)
            recons = DataFrame.from_csv(path, **kwargs)

    def _to_uni(x):
        # Decode anything that is not already text as UTF-8.
        if not isinstance(x, compat.text_type):
            return x.decode('utf8')
        return x
    if dupe_col:
        # read_Csv disambiguates the columns by
        # labeling them dupe.1,dupe.2, etc'. monkey patch columns
        recons.columns = df.columns
    if rnlvl and not cnlvl:
        # Rebuild the row MultiIndex from the current index plus the first
        # ``rnlvl - 1`` data columns, then drop those columns.
        delta_lvl = [ recons.iloc[:, i].values for i in range(rnlvl - 1) ]
        ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
        recons.index = ix
        recons = recons.iloc[:, rnlvl - 1:]

    # dtype code -> numpy dtype character used for the plain cases.
    type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
    if r_dtype:
        if r_dtype == 'u':  # unicode
            r_dtype = 'O'
            recons.index = np.array(lmap(_to_uni, recons.index), dtype=r_dtype)
            df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
        elif r_dtype == 'dt':  # labels are converted with Timestamp
            r_dtype = 'O'
            recons.index = np.array(lmap(Timestamp, recons.index), dtype=r_dtype)
            df.index = np.array(lmap(Timestamp, df.index), dtype=r_dtype)
        elif r_dtype == 'p':
            r_dtype = 'O'
            recons.index = np.array(list( map(Timestamp, recons.index.to_datetime())), dtype=r_dtype)
            df.index = np.array(list( map(Timestamp, df.index.to_datetime())), dtype=r_dtype)
        else:
            r_dtype = type_map.get(r_dtype)
            recons.index = np.array(recons.index, dtype=r_dtype)
            df.index = np.array(df.index, dtype=r_dtype)
    if c_dtype:
        if c_dtype == 'u':
            c_dtype = 'O'
            recons.columns = np.array(lmap(_to_uni, recons.columns), dtype=c_dtype)
            df.columns = np.array(lmap(_to_uni, df.columns), dtype=c_dtype)
        elif c_dtype == 'dt':
            c_dtype = 'O'
            recons.columns = np.array(lmap(Timestamp, recons.columns), dtype=c_dtype)
            df.columns = np.array(lmap(Timestamp, df.columns), dtype=c_dtype)
        elif c_dtype == 'p':
            c_dtype = 'O'
            recons.columns = np.array(lmap( Timestamp, recons.columns.to_datetime()), dtype=c_dtype)
            df.columns = np.array(lmap(Timestamp, df.columns.to_datetime()), dtype=c_dtype)
        else:
            c_dtype = type_map.get(c_dtype)
            recons.columns = np.array(recons.columns, dtype=c_dtype)
            df.columns = np.array(df.columns, dtype=c_dtype)

    assert_frame_equal(df, recons, check_names=False, check_less_precise=True)
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
    """Assert ``result`` matches grouping ``df`` by row tuples of ``keys``.

    ``f`` is applied to the grouped ``field`` column to build the expected
    values, which are then compared with ``result`` key by key.
    """
    row_keys = [tuple(row) for row in df[keys].values]
    row_keys = com._asarray_tuplesafe(row_keys)

    grouped_field = df.groupby(row_keys)[field]
    expected = f(grouped_field)
    for key, value in compat.iteritems(expected):
        assert (result[key] == value)
def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None):
    """Return the colors to use for ``num_colors`` plot elements.

    The source is chosen in this order: ``colormap`` (only when ``color``
    is None), then ``color``, then matplotlib's default cycle
    (``color_type='default'``) or random colors (``color_type='random'``).
    If fewer than ``num_colors`` colors result, they are repeated
    cyclically.

    Raises
    ------
    ValueError
        If a colormap name is not recognized, ``color_type`` is neither
        'default' nor 'random', or the resulting color sequence is empty.
    """
    import matplotlib.pyplot as plt

    if color is None and colormap is not None:
        if isinstance(colormap, str):
            import matplotlib.cm as cm
            # Keep the name for the error message before it is replaced.
            cmap = colormap
            colormap = cm.get_cmap(colormap)
            if colormap is None:
                raise ValueError("Colormap {0} is not recognized".format(cmap))
        # Sample the colormap at evenly spaced points in [0, 1].
        colors = lmap(colormap, np.linspace(0, 1, num=num_colors))
    elif color is not None:
        if colormap is not None:
            warnings.warn("'color' and 'colormap' cannot be used "
                          "simultaneously. Using 'color'")
        colors = list(color) if is_list_like(color) else color
    else:
        if color_type == 'default':
            # need to call list() on the result to copy so we don't
            # modify the global rcParams below
            try:
                colors = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ]
            except KeyError:
                colors = list( plt.rcParams.get('axes.color_cycle', list('bgrcmyk')))
            if isinstance(colors, str):
                colors = list(colors)

            colors = colors[0:num_colors]
        elif color_type == 'random':
            import pandas.core.common as com

            def random_color(column):
                """ Returns a random color represented as a list of length 3"""
                # GH17525 use common._random_state to avoid resetting the seed
                rs = com.random_state(column)
                return rs.rand(3).tolist()

            colors = lmap(random_color, lrange(num_colors))
        else:
            raise ValueError("color_type must be either 'default' or 'random'")

    if isinstance(colors, str):
        import matplotlib.colors
        conv = matplotlib.colors.ColorConverter()

        def _maybe_valid_colors(colors):
            # True when every entry can be converted to RGBA.
            try:
                [conv.to_rgba(c) for c in colors]
                return True
            except ValueError:
                return False

        # check whether the string can be convertible to single color
        maybe_single_color = _maybe_valid_colors([colors])
        # check whether each character can be convertible to colors
        maybe_color_cycle = _maybe_valid_colors(list(colors))
        if maybe_single_color and maybe_color_cycle and len(colors) > 1:
            # The second character is used as an index into the property
            # cycle (presumably a 'CN'-style color string -- TODO confirm).
            hex_color = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ]
            colors = [hex_color[int(colors[1])]]
        elif maybe_single_color:
            colors = [colors]
        else:
            # ``colors`` is regarded as color cycle.
            # mpl will raise error any of them is invalid
            pass

    # Append more colors by cycling if there is not enough color.
    # Extra colors will be ignored by matplotlib if there are more colors
    # than needed and nothing needs to be done here.
    if len(colors) < num_colors:
        try:
            multiple = num_colors // len(colors) - 1
        except ZeroDivisionError:
            raise ValueError("Invalid color argument: ''")
        mod = num_colors % len(colors)
        colors += multiple * colors
        colors += colors[:mod]

    return colors
def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None):
    """Return the colors to use for ``num_colors`` plot elements.

    The source is chosen in this order: ``colormap`` (only when ``color``
    is None), then ``color``, then matplotlib's default cycle
    (``color_type='default'``) or random colors (``color_type='random'``).
    When the number of colors differs from ``num_colors`` the sequence is
    extended by repeating it.

    Raises
    ------
    ValueError
        If a colormap name is not recognized, ``color_type`` is neither
        'default' nor 'random', a color string is ambiguous between a
        single color and a color cycle, or the color sequence is empty.
    """
    import matplotlib.pyplot as plt

    if color is None and colormap is not None:
        if isinstance(colormap, compat.string_types):
            import matplotlib.cm as cm
            # Keep the name for the error message before it is replaced.
            cmap = colormap
            colormap = cm.get_cmap(colormap)
            if colormap is None:
                raise ValueError("Colormap {0} is not recognized".format(cmap))
        # Sample the colormap at evenly spaced points in [0, 1].
        colors = lmap(colormap, np.linspace(0, 1, num=num_colors))
    elif color is not None:
        if colormap is not None:
            warnings.warn("'color' and 'colormap' cannot be used "
                          "simultaneously. Using 'color'")
        colors = list(color) if is_list_like(color) else color
    else:
        if color_type == 'default':
            # need to call list() on the result to copy so we don't
            # modify the global rcParams below
            try:
                colors = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ]
            except KeyError:
                colors = list( plt.rcParams.get('axes.color_cycle', list('bgrcmyk')))
            if isinstance(colors, compat.string_types):
                colors = list(colors)
        elif color_type == 'random':
            import pandas.core.common as com

            def random_color(column):
                """ Returns a random color represented as a list of length 3"""
                # GH17525 use common._random_state to avoid resetting the seed
                rs = com._random_state(column)
                return rs.rand(3).tolist()

            colors = lmap(random_color, lrange(num_colors))
        else:
            raise ValueError("color_type must be either 'default' or 'random'")

    if isinstance(colors, compat.string_types):
        import matplotlib.colors
        conv = matplotlib.colors.ColorConverter()

        def _maybe_valid_colors(colors):
            # True when every entry can be converted to RGBA.
            try:
                [conv.to_rgba(c) for c in colors]
                return True
            except ValueError:
                return False

        # check whether the string can be convertible to single color
        maybe_single_color = _maybe_valid_colors([colors])
        # check whether each character can be convertible to colors
        maybe_color_cycle = _maybe_valid_colors(list(colors))
        if maybe_single_color and maybe_color_cycle and len(colors) > 1:
            # Special case for single str 'CN' match and convert to hex
            # for supporting matplotlib < 2.0.0
            if re.match(r'\AC[0-9]\Z', colors) and _mpl_ge_2_0_0():
                hex_color = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ]
                colors = [hex_color[int(colors[1])]]
            else:
                # this may no longer be required
                msg = ("'{0}' can be parsed as both single color and "
                       "color cycle. Specify each color using a list "
                       "like ['{0}'] or {1}")
                raise ValueError(msg.format(colors, list(colors)))
        elif maybe_single_color:
            colors = [colors]
        else:
            # ``colors`` is regarded as color cycle.
            # mpl will raise error any of them is invalid
            pass

    # NOTE(review): with ``!=`` this branch also runs when there are more
    # colors than ``num_colors``; ``multiple`` is then negative and the
    # first ``num_colors`` colors are appended again -- confirm intended.
    if len(colors) != num_colors:
        try:
            multiple = num_colors // len(colors) - 1
        except ZeroDivisionError:
            raise ValueError("Invalid color argument: ''")
        mod = num_colors % len(colors)
        colors += multiple * colors
        colors += colors[:mod]

    return colors
def test_iloc_mask(self):
    """Boolean masks with [], .loc and .iloc give the expected results."""
    # GH 3631, iloc with a mask (of a series) should raise
    df = DataFrame(lrange(5), list('ABCDE'), columns=['a'])
    mask = (df.a % 2 == 0)
    pytest.raises(ValueError, df.iloc.__getitem__, tuple([mask]))
    mask.index = lrange(len(mask))
    pytest.raises(NotImplementedError, df.iloc.__getitem__, tuple([mask]))

    # ndarray ok
    result = df.iloc[np.array([True] * len(mask), dtype=bool)]
    tm.assert_frame_equal(result, df)

    # the possibilities
    locs = np.arange(4)
    nums = 2 ** locs
    reps = lmap(bin, nums)
    df = DataFrame({'locs': locs, 'nums': nums}, reps)

    # (mask index source, accessor) -> binary sum of the selected ``nums``
    # or the text of the exception that indexing raises.
    expected = {
        (None, ''): '0b1100',
        (None, '.loc'): '0b1100',
        (None, '.iloc'): '0b1100',
        ('index', ''): '0b11',
        ('index', '.loc'): '0b11',
        ('index', '.iloc'): ('iLocation based boolean indexing '
                             'cannot use an indexable as a mask'),
        ('locs', ''): 'Unalignable boolean Series provided as indexer '
                      '(index of the boolean Series and of the indexed '
                      'object do not match',
        ('locs', '.loc'): 'Unalignable boolean Series provided as indexer '
                          '(index of the boolean Series and of the '
                          'indexed object do not match',
        ('locs', '.iloc'): ('iLocation based boolean indexing on an '
                            'integer type is not available'),
    }

    # UserWarnings from reindex of a boolean mask
    with catch_warnings(record=True):
        result = dict()
        for idx in [None, 'index', 'locs']:
            mask = (df.nums > 2).values
            if idx:
                # Wrap the mask in a Series indexed by the reversed values
                # of the chosen attribute.
                mask = Series(mask, list(reversed(getattr(df, idx))))
            for method in ['', '.loc', '.iloc']:
                try:
                    if method:
                        accessor = getattr(df, method[1:])
                    else:
                        accessor = df
                    ans = str(bin(accessor[mask]['nums'].sum()))
                except Exception as e:
                    # The exception text itself is the compared outcome.
                    ans = str(e)

                key = tuple([idx, method])
                r = expected.get(key)
                if r != ans:
                    raise AssertionError(
                        "[%s] does not match [%s], received [%s]" % (key, ans, r))