Example #1
def get_data_famafrench(name):
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
                                                                     start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
Example #2
    def test_bar_colors(self):
        import matplotlib.pyplot as plt
        import matplotlib.colors as colors

        default_colors = plt.rcParams.get('axes.color_cycle')
        custom_colors = 'rgcby'

        df = DataFrame(randn(5, 5))
        ax = df.plot(kind='bar')

        rects = ax.patches

        conv = colors.colorConverter
        for i, rect in enumerate(rects[::5]):
            xp = conv.to_rgba(default_colors[i % len(default_colors)])
            rs = rect.get_facecolor()
            self.assertEqual(xp, rs)

        tm.close()

        ax = df.plot(kind='bar', color=custom_colors)

        rects = ax.patches

        conv = colors.colorConverter
        for i, rect in enumerate(rects[::5]):
            xp = conv.to_rgba(custom_colors[i])
            rs = rect.get_facecolor()
            self.assertEqual(xp, rs)

        tm.close()
        from matplotlib import cm

        # Test str -> colormap functionality
        ax = df.plot(kind='bar', colormap='jet')

        rects = ax.patches

        rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5))
        for i, rect in enumerate(rects[::5]):
            xp = rgba_colors[i]
            rs = rect.get_facecolor()
            self.assertEqual(xp, rs)

        tm.close()

        # Test colormap functionality
        ax = df.plot(kind='bar', colormap=cm.jet)

        rects = ax.patches

        rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5))
        for i, rect in enumerate(rects[::5]):
            xp = rgba_colors[i]
            rs = rect.get_facecolor()
            self.assertEqual(xp, rs)

        tm.close()
        df.ix[:, [0]].plot(kind='bar', color='DodgerBlue')
Example #3
    def test_line_colors(self):
        import matplotlib.pyplot as plt
        import sys
        from matplotlib import cm

        custom_colors = 'rgcby'

        plt.close('all')
        df = DataFrame(randn(5, 5))

        ax = df.plot(color=custom_colors)

        lines = ax.get_lines()
        for i, l in enumerate(lines):
            xp = custom_colors[i]
            rs = l.get_color()
            self.assert_(xp == rs)

        tmp = sys.stderr
        sys.stderr = StringIO()
        try:
            plt.close('all')
            ax2 = df.plot(colors=custom_colors)
            lines2 = ax2.get_lines()
            for l1, l2 in zip(lines, lines2):
                self.assertEqual(l1.get_color(), l2.get_color())
        finally:
            sys.stderr = tmp

        plt.close('all')

        ax = df.plot(colormap='jet')

        rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))

        lines = ax.get_lines()
        for i, l in enumerate(lines):
            xp = rgba_colors[i]
            rs = l.get_color()
            self.assert_(xp == rs)

        plt.close('all')

        ax = df.plot(colormap=cm.jet)

        rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))

        lines = ax.get_lines()
        for i, l in enumerate(lines):
            xp = rgba_colors[i]
            rs = l.get_color()
            self.assert_(xp == rs)

        # make color a list if plotting one column frame
        # handles cases like df.plot(color='DodgerBlue')
        plt.close('all')
        df.ix[:, [0]].plot(color='DodgerBlue')
Example #4
    def test_to_csv_from_csv3(self):

        with ensure_clean('__tmp_to_csv_from_csv3__') as path:
            df1 = DataFrame(np.random.randn(3, 1))
            df2 = DataFrame(np.random.randn(3, 1))

            df1.to_csv(path)
            df2.to_csv(path, mode='a', header=False)
            xp = pd.concat([df1, df2])
            rs = pd.read_csv(path, index_col=0)
            rs.columns = lmap(int, rs.columns)
            xp.columns = lmap(int, xp.columns)
            assert_frame_equal(xp, rs)
Example #5
    def test_andrews_curves(self):
        from pandas.tools.plotting import andrews_curves
        from matplotlib import cm

        df = self.iris

        _check_plot_works(andrews_curves, frame=df, class_column='Name')

        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', color=rgba)
        self._check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])

        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', color=cnames)
        self._check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])

        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])

        length = 10
        df = DataFrame({"A": random.rand(length),
                        "B": random.rand(length),
                        "C": random.rand(length),
                        "Name": ["A"] * length})

        _check_plot_works(andrews_curves, frame=df, class_column='Name')

        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', color=rgba)
        self._check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])

        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', color=cnames)
        self._check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])

        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])

        colors = ['b', 'g', 'r']
        df = DataFrame({"A": [1, 2, 3],
                        "B": [1, 2, 3],
                        "C": [1, 2, 3],
                        "Name": colors})
        ax = andrews_curves(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, linecolors=colors)

        with tm.assert_produces_warning(FutureWarning):
            andrews_curves(data=df, class_column='Name')
Example #6
def _expand_elements(body):
    lens = Series(lmap(len, body))
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    for ind, length in iteritems(not_max):
        body[ind] += [np.nan] * (lens_max - length)
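A quick self-contained illustration of what this padding achieves (hedged sketch; plain lists stand in for the Series bookkeeping above, and NaN for the filler value):

import math

body = [['a', 'b', 'c'], ['d'], ['e', 'f']]
max_len = max(len(row) for row in body)      # the longest row sets the width
for row in body:
    row += [float('nan')] * (max_len - len(row))

assert len(body[1]) == len(body[0]) == 3     # every row padded to equal length
assert math.isnan(body[1][1])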
Example #7
def test_split_ranges():
    def _bin(x, width):
        "return int(x) as a base2 string of given width"
        return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1))

    def test_locs(mask):
        nfalse = sum(np.array(mask) == 0)

        remaining = 0
        for s, e in com.split_ranges(mask):
            remaining += e - s

            assert 0 not in mask[s:e]

        # the True runs plus the False entries must cover every item in mask
        assert remaining + nfalse == len(mask)

    # exhaustively test all possible mask sequences of length 8
    ncols = 8
    for i in range(2 ** ncols):
        cols = lmap(int, list(_bin(i, ncols)))  # count up in base2
        mask = [cols[i] == 1 for i in range(len(cols))]
        test_locs(mask)

    # base cases
    test_locs([])
    test_locs([0])
    test_locs([1])
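The test relies on com.split_ranges; a hedged reimplementation of the behaviour the assertions require (contiguous runs of truthy values yielded as half-open (start, end) pairs) could look like this:

def split_ranges_sketch(mask):
    """Yield (start, end) pairs covering each run of truthy values in mask.

    Hypothetical stand-in written only to match the properties asserted in
    test_split_ranges above; not the pandas implementation itself.
    """
    start = None
    for i, val in enumerate(mask):
        if val and start is None:
            start = i                      # a run begins
        elif not val and start is not None:
            yield start, i                 # a run ends before a falsy element
            start = None
    if start is not None:
        yield start, len(mask)             # the final run extends to the end

# list(split_ranges_sketch([1, 1, 0, 1])) -> [(0, 2), (3, 4)]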
Example #8
File: expr.py  Project: Axik/pandas
def _preparse(source, f=compose(_replace_locals, _replace_booleans,
                                _rewrite_assign)):
    """Compose a collection of tokenization functions

    Parameters
    ----------
    source : str
        A Python source code string
    f : callable
        This takes a tuple of (toknum, tokval) as its argument and returns a
        tuple with the same structure but possibly different elements. Defaults
        to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
        ``_replace_locals``.

    Returns
    -------
    s : str
        Valid Python source code

    Notes
    -----
    The `f` parameter can be any callable that takes *and* returns input of the
    form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
    the ``tokenize`` module and ``tokval`` is a string.
    """
    assert callable(f), 'f must be callable'
    return tokenize.untokenize(lmap(f, tokenize_string(source)))
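For readers unfamiliar with the tokenize round-trip this performs, here is a hedged, self-contained sketch using only the standard library (the identity rewriter stands in for the composed pandas rewriters, and whitespace in the output may differ from the input):

import io
import tokenize

def preparse_sketch(source, f=lambda tok: tok):
    # tokenize the source into (toknum, tokval) pairs, rewrite each pair
    # with f, then untokenize back into equivalent Python source
    pairs = ((tok.type, tok.string)
             for tok in tokenize.generate_tokens(io.StringIO(source).readline))
    return tokenize.untokenize(f(pair) for pair in pairs)

print(preparse_sketch("a = b and not c"))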
Example #9
    def test_parse_dates_column_list(self):
        from pandas.core.datetools import to_datetime

        data = '''date;destination;ventilationcode;unitcode;units;aux_date
01/01/2010;P;P;50;1;12/1/2011
01/01/2010;P;R;50;1;13/1/2011
15/01/2010;P;P;50;1;14/1/2011
01/05/2010;P;P;50;1;15/1/2011'''

        expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4))

        lev = expected.index.levels[0]
        levels = list(expected.index.levels)
        levels[0] = lev.to_datetime(dayfirst=True)
        # hack to get this to work - remove for final test
        levels[0].name = lev.name
        expected.index.set_levels(levels, inplace=True)
        expected['aux_date'] = to_datetime(expected['aux_date'],
                                           dayfirst=True)
        expected['aux_date'] = lmap(Timestamp, expected['aux_date'])
        tm.assertIsInstance(expected['aux_date'][0], datetime)

        df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4),
                           parse_dates=[0, 5], dayfirst=True)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4),
                           parse_dates=['date', 'aux_date'], dayfirst=True)
        tm.assert_frame_equal(df, expected)
Example #10
    def test_radviz(self):
        from pandas.tools.plotting import radviz
        from matplotlib import cm

        df = self.iris
        _check_plot_works(radviz, df, 'Name')

        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(radviz, df, 'Name', color=rgba)
        # skip Circle drawn as ticks
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches[:10], facecolors=rgba, mapping=df['Name'][:10])

        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(radviz, df, 'Name', color=cnames)
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10])

        ax = _check_plot_works(radviz, df, 'Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10])

        colors = [[0., 0., 1., 1.],
                  [0., 0.5, 1., 1.],
                  [1., 0., 0., 1.]]
        df = DataFrame({"A": [1, 2, 3],
                        "B": [2, 1, 3],
                        "C": [3, 2, 1],
                        "Name": ['b', 'g', 'r']})
        ax = radviz(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, facecolors=colors)
Example #11
 def _parse_raw_thead(self, table):
     thead = self._parse_thead(table)
     res = []
     if thead:
         res = lmap(self._text_getter, self._parse_th(thead[0]))
     return np.atleast_1d(
         np.array(res).squeeze()) if res and len(res) == 1 else res
Example #12
 def _parse_raw_tfoot(self, table):
     tfoot = self._parse_tfoot(table)
     res = []
     if tfoot:
         res = lmap(self._text_getter, self._parse_td(tfoot[0]))
     return np.atleast_1d(
         np.array(res).squeeze()) if res and len(res) == 1 else res
Example #13
 def test_lmap(self):
     func = lambda x, y, z: x + y + z
     lst = [builtins.range(10), builtins.range(10), builtins.range(10)]
     results = lmap(func, *lst),
     expecteds = list(builtins.map(func, *lst)),
     lengths = 10,
     self.check_results(results, expecteds, lengths)
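For context, the helper under test is simply an eager map; a minimal hedged sketch mirroring the behaviour these assertions rely on:

def lmap_sketch(*args):
    # eager variant of map: materialise the iterator into a list
    return list(map(*args))

assert lmap_sketch(lambda x, y, z: x + y + z,
                   range(3), range(3), range(3)) == [0, 3, 6]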
Example #14
 def _get_label_to_i_dict(labels, sort_labels=False):
     """ Return OrderedDict of unique labels to number.
     Optionally sort by label. """
     labels = Index(lmap(tuple, labels)).unique().tolist()  # squish
     if sort_labels:
         labels = sorted(list(labels))
     d = OrderedDict((k, i) for i, k in enumerate(labels))
     return d
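Its effect on a small made-up input (hedged sketch that avoids the pandas Index dependency):

from collections import OrderedDict

# Hedged stand-in: OrderedDict.fromkeys plays the role of
# Index(...).unique() (both keep first-appearance order).
labels = [(1, 'b'), (0, 'a'), (1, 'b')]
unique = list(OrderedDict.fromkeys(tuple(l) for l in labels))
d = OrderedDict((k, i) for i, k in enumerate(unique))
assert d == OrderedDict([((1, 'b'), 0), ((0, 'a'), 1)])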
Example #15
def _expand_elements(body):
    lens = Series(lmap(len, body))
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = ['']
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
Example #16
 def _col_size(self, k=None):
     """Calculate size of a data record."""
     if len(self.col_sizes) == 0:
         self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist)
     if k is None:
         return self.col_sizes
     else:
         return self.col_sizes[k]
Example #17
 def test_map(self):
     func = lambda x, y, z: x + y + z
     lst = [builtins.range(10), builtins.range(10), builtins.range(10)]
     actual1 = map(func, *lst)
     actual2 = lmap(func, *lst)
     actual = [actual1, actual2],
     expected = list(builtins.map(func, *lst)),
     lengths = 10,
     self.check_result(actual, expected, lengths)
Example #18
    def inner(x):
        from pandas.io.formats.printing import pprint_thing as pp
        if x not in legal_values:

            if not any([c(x) for c in callables]):
                pp_values = pp("|".join(lmap(pp, legal_values)))
                msg = "Value must be one of {pp_values}"
                if len(callables):
                    msg += " or a callable"
                raise ValueError(msg.format(pp_values=pp_values))
Example #19
 def _parse_raw_thead(self, table):
     thead = self._parse_thead(table)
     res = []
     if thead:
         trs = self._parse_tr(thead[0])
         for tr in trs:
             cols = lmap(self._text_getter, self._parse_td(tr))
             if any([col != '' for col in cols]):
                 res.append(cols)
     return res
Example #20
def get_results_df(db, rev):
    """Takes a git commit hash and returns a Dataframe of benchmark results
    """
    bench = DataFrame(db.get_benchmarks())
    results = DataFrame(lmap(list, db.get_rev_results(rev).values()))

    # Since vbench.db._reg_rev_results returns an unlabeled dict,
    # we have to break encapsulation a bit.
    results.columns = list(db._results.c.keys())
    results = results.join(bench['name'], on='checksum').set_index("checksum")
    return results
Example #21
        def convert_score(x):
            x = x.strip()
            if not x:
                return np.nan
            if x.find('-') > 0:
                valmin, valmax = lmap(int, x.split('-'))
                val = 0.5 * (valmin + valmax)
            else:
                val = float(x)

            return val
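Expected behaviour of this scorer on a few made-up inputs (hedged usage note; assumes the closure above is in scope):

# '4-6'  -> 5.0   midpoint of a dashed range
# '7'    -> 7.0   plain number parsed as a float
# '   '  -> nan   blank entries become missing values
assert convert_score('4-6') == 5.0
assert convert_score('7') == 7.0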
Example #22
    def test_iloc_mask(self):

        # GH 3631, iloc with a mask (of a series) should raise
        df = DataFrame(lrange(5), list('ABCDE'), columns=['a'])
        mask = (df.a % 2 == 0)
        self.assertRaises(ValueError, df.iloc.__getitem__, tuple([mask]))
        mask.index = lrange(len(mask))
        self.assertRaises(NotImplementedError, df.iloc.__getitem__, tuple([mask]))

        # ndarray ok
        result = df.iloc[np.array([True] * len(mask), dtype=bool)]
        assert_frame_equal(result, df)

        # the possibilities
        locs = np.arange(4)
        nums = 2**locs
        reps = lmap(bin, nums)
        df = DataFrame({'locs': locs, 'nums': nums}, reps)

        expected = {
            (None, ''): '0b1100',
            (None, '.loc'): '0b1100',
            (None, '.iloc'): '0b1100',
            ('index', ''): '0b11',
            ('index', '.loc'): '0b11',
            ('index', '.iloc'): 'iLocation based boolean indexing cannot use an indexable as a mask',
            ('locs', ''): 'Unalignable boolean Series key provided',
            ('locs', '.loc'): 'Unalignable boolean Series key provided',
            ('locs', '.iloc'): 'iLocation based boolean indexing on an integer type is not available',
            }

        import warnings
        warnings.filterwarnings(action='ignore', category=UserWarning)
        result = dict()
        for idx in [None, 'index', 'locs']:
            mask = (df.nums > 2).values
            if idx:
                mask = Series(mask, list(reversed(getattr(df, idx))))
            for method in ['', '.loc', '.iloc']:
                try:
                    if method:
                        accessor = getattr(df, method[1:])
                    else:
                        accessor = df
                    ans = str(bin(accessor[mask]['nums'].sum()))
                except Exception as e:
                    ans = str(e)

                key = tuple([idx,method])
                r = expected.get(key)
                if r != ans:
                    raise AssertionError("[%s] does not match [%s], received [%s]" %
                                         (key,ans,r))
        warnings.filterwarnings(action='always', category=UserWarning)
Example #23
    def _get_one(self, name, *args, **kwargs):
        url = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/{name}.zip'\
            .format(name=name)

        response = self.session.get(url)
        raw = response.content  # .content returns bytes (.text returns unicode)
        if response.status_code != 200:
            raise IOError("Failed to get the data. Check that {0!r} is "
                          "a valid FamaFrench dataset.".format(name))

        with tempfile.TemporaryFile() as tmpf:
            tmpf.write(raw)

            with ZipFile(tmpf, 'r') as zf:
                data = zf.open(zf.namelist()[0]).readlines()

        line_lengths = np.array(lmap(len, data))
        file_edges = np.where(line_lengths == 2)[0]

        datasets = {}
        edges = zip(file_edges + 1, file_edges[1:])
        for i, (left_edge, right_edge) in enumerate(edges):
            dataset = [d.split() for d in data[left_edge:right_edge]]
            if len(dataset) > 10:
                ncol_raw = np.array(lmap(len, dataset))
                ncol = np.median(ncol_raw)
                header_index = np.where(ncol_raw == ncol - 1)[0][-1]
                header = dataset[header_index]
                ds_header = dataset[header_index + 1:]
                # to ensure the header is unique
                header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
                                                                         start=1)]
                index = np.array([d[0] for d in ds_header], dtype=int)
                dataset = np.array([d[1:] for d in ds_header], dtype=float)
                datasets[i] = pd.DataFrame(dataset, index, columns=header)

        return datasets
Example #24
 def test_plot_single_color(self):
     # Example from #20585. All 3 bars should have the same color
     df = DataFrame({'account-start': ['2017-02-03', '2017-03-03',
                                       '2017-01-01'],
                     'client': ['Alice Anders', 'Bob Baker',
                                'Charlie Chaplin'],
                     'balance': [-1432.32, 10.43, 30000.00],
                     'db-id': [1234, 2424, 251],
                     'proxy-id': [525, 1525, 2542],
                     'rank': [52, 525, 32],
                     })
     ax = df.client.value_counts().plot.bar()
     colors = lmap(lambda rect: rect.get_facecolor(),
                   ax.get_children()[0:3])
     assert all(color == colors[0] for color in colors)
Example #25
    def test_map_with_string_constructor(self):
        raw = [2005, 2007, 2009]
        index = PeriodIndex(raw, freq='A')

        expected = Index(lmap(str, raw))
        res = index.map(str)

        # should return an Index
        assert isinstance(res, Index)

        # preserve element types
        assert all(isinstance(resi, str) for resi in res)

        # lastly, values should compare equal
        tm.assert_index_equal(res, expected)
Example #26
def compute_autocorrelation(data):
    # from pandas.tools.plotting import autocorrelation_plot
    # see http://pandas.pydata.org/pandas-docs/dev/visualization.html#autocorrelation-plot
    # see http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm

    from pandas.compat import lmap

    n = len(data)
    mean = np.mean(data)
    c0 = np.sum((data - mean) ** 2) / float(n)

    def r(h):
        return ((data[:n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0
    x = np.arange(n) + 1
    y = lmap(r, x)
    return np.asarray(y, dtype=np.float32)
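Here r(h) is the standard lag-h sample autocorrelation normalised by n·c0, so every returned value is bounded by 1 in absolute value; a hedged sanity check (assuming the function above and its pandas.compat dependency are importable):

import numpy as np

data = np.sin(np.linspace(0, 4 * np.pi, 50))
acf = compute_autocorrelation(data)          # r(1) .. r(n), since x = arange(n) + 1
assert acf.shape == (50,)
assert np.all(np.abs(acf) <= 1.0 + 1e-6)     # Cauchy-Schwarz bound on each lag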
Example #27
    def applymap(self, func):
        """
        Apply a function to a DataFrame that is intended to operate
        elementwise, i.e. like doing map(func, series) for each series in the
        DataFrame

        Parameters
        ----------
        func : function
            Python function, returns a single value from a single value

        Returns
        -------
        applied : DataFrame
        """
        return self.apply(lambda x: lmap(func, x))
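A typical elementwise use via the public API (hedged sketch; the frame and the rounding function are illustrative only):

import pandas as pd

df = pd.DataFrame({'a': [1.234, 5.678], 'b': [9.876, 5.432]})
rounded = df.applymap(lambda x: round(x, 1))   # func applied to every element
print(rounded)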
Example #28
    def test_parallel_coordinates(self):
        from pandas.plotting import parallel_coordinates
        from matplotlib import cm

        df = self.iris

        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name')
        nlines = len(ax.get_lines())
        nxticks = len(ax.xaxis.get_ticklabels())

        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', color=rgba)
        self._check_colors(
            ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])

        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', color=cnames)
        self._check_colors(
            ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])

        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(
            ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])

        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', axvlines=False)
        assert len(ax.get_lines()) == (nlines - nxticks)

        colors = ['b', 'g', 'r']
        df = DataFrame({"A": [1, 2, 3],
                        "B": [1, 2, 3],
                        "C": [1, 2, 3],
                        "Name": colors})
        ax = parallel_coordinates(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, linecolors=colors)

        with tm.assert_produces_warning(FutureWarning):
            parallel_coordinates(data=df, class_column='Name')
        with tm.assert_produces_warning(FutureWarning):
            parallel_coordinates(df, 'Name', colors=colors)
Example #29
def to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            arrays = [data._ixs(i, axis=1).values
                      for i, col in enumerate(data.columns) if col in columns]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []
    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
    elif isinstance(data[0], compat.Mapping):
        return _list_of_dict_to_arrays(data, columns,
                                       coerce_float=coerce_float, dtype=dtype)
    elif isinstance(data[0], ABCSeries):
        return _list_of_series_to_arrays(data, columns,
                                         coerce_float=coerce_float,
                                         dtype=dtype)
    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns
    elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and
          data.dtype.names is not None):

        columns = list(data.dtype.names)
        arrays = [data[k] for k in columns]
        return arrays, columns
    else:
        # last ditch effort
        data = lmap(tuple, data)
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
Example #30
def autocorrelation_plot(series, ax=None, **kwds):
    """
    Autocorrelation plot for time series.

    Parameters
    ----------
    series : Time series
    ax : Matplotlib axis object, optional
    kwds : keywords
        Options to pass to matplotlib plotting method

    Returns
    -------
    ax : matplotlib.axes.Axes
    """
    import matplotlib.pyplot as plt
    n = len(series)
    data = np.asarray(series)
    if ax is None:
        ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0))
    mean = np.mean(data)
    c0 = np.sum((data - mean) ** 2) / float(n)

    def r(h):
        return ((data[:n - h] - mean) *
                (data[h:] - mean)).sum() / float(n) / c0
    x = np.arange(n) + 1
    y = lmap(r, x)
    z95 = 1.959963984540054
    z99 = 2.5758293035489004
    ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey')
    ax.axhline(y=z95 / np.sqrt(n), color='grey')
    ax.axhline(y=0.0, color='black')
    ax.axhline(y=-z95 / np.sqrt(n), color='grey')
    ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey')
    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    ax.plot(x, y, **kwds)
    if 'label' in kwds:
        ax.legend()
    ax.grid()
    return ax
Example #31
    def data(self, convert_dates=True, convert_categoricals=True, index=None):
        """
        Reads observations from Stata file, converting them into a dataframe

        Parameters
        ----------
        convert_dates : boolean, defaults to True
            Convert date variables to DataFrame time values
        convert_categoricals : boolean, defaults to True
            Read value labels and convert columns to Categorical/Factor variables
        index : identifier of index column
            identifier of column that should be used as index of the DataFrame

        Returns
        -------
        y : DataFrame instance
        """
        if self._data_read:
            raise Exception("Data has already been read.")
        self._data_read = True

        stata_dta = self._dataset()

        data = []
        for rownum, line in enumerate(stata_dta):
            # doesn't handle missing value objects; just casts
            # None will only work without a missing value object
            for i, val in enumerate(line):
                # NOTE: this will only be scalar types because missing
                # strings are empty, not None, in Stata
                if val is None:
                    line[i] = np.nan
            data.append(tuple(line))

        if convert_categoricals:
            self._read_value_labels()

        data = DataFrame(data, columns=self.varlist, index=index)

        cols_ = np.where(self.dtyplist)[0]
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                if data[col].dtype is not np.dtype(object):
                    data[col] = Series(data[col], data[col].index, self.dtyplist[i])

        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats, self.fmtlist))[0]
            for i in cols:
                col = data.columns[i]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime, args=(self.fmtlist[i],))

        if convert_categoricals:
            cols = np.where(lmap(lambda x: x in compat.iterkeys(self.value_label_dict), self.lbllist))[0]
            for i in cols:
                col = data.columns[i]
                labeled_data = np.copy(data[col])
                labeled_data = labeled_data.astype(object)
                for k, v in compat.iteritems(self.value_label_dict[self.lbllist[i]]):
                    labeled_data[data[col] == k] = v
                data[col] = Categorical.from_array(labeled_data)

        return data
Example #32
 def inner(x):
     from pandas.core.common import pprint_thing as pp
     if x not in legal_values:
         pp_values = lmap(pp, legal_values)
         raise ValueError("Value must be one of %s" %
                          pp("|".join(pp_values)))
Example #33
 def _parse_raw_thead(self, table):
     thead = self._parse_thead(table)
     res = []
     if thead:
         res = lmap(self._text_getter, self._parse_th(thead[0]))
     return np.array(res).squeeze() if res and len(res) == 1 else res
Example #34
 def _parse_raw_tfoot(self, table):
     tfoot = self._parse_tfoot(table)
     res = []
     if tfoot:
         res = lmap(self._text_getter, self._parse_td(tfoot[0]))
     return np.array(res).squeeze() if res and len(res) == 1 else res
Example #35
    def test_andrews_curves(self, iris):
        from pandas.plotting import andrews_curves
        from matplotlib import cm

        df = iris

        _check_plot_works(andrews_curves, frame=df, class_column='Name')

        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(andrews_curves,
                               frame=df,
                               class_column='Name',
                               color=rgba)
        self._check_colors(ax.get_lines()[:10],
                           linecolors=rgba,
                           mapping=df['Name'][:10])

        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(andrews_curves,
                               frame=df,
                               class_column='Name',
                               color=cnames)
        self._check_colors(ax.get_lines()[:10],
                           linecolors=cnames,
                           mapping=df['Name'][:10])

        ax = _check_plot_works(andrews_curves,
                               frame=df,
                               class_column='Name',
                               colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10],
                           linecolors=cmaps,
                           mapping=df['Name'][:10])

        length = 10
        df = DataFrame({
            "A": random.rand(length),
            "B": random.rand(length),
            "C": random.rand(length),
            "Name": ["A"] * length
        })

        _check_plot_works(andrews_curves, frame=df, class_column='Name')

        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(andrews_curves,
                               frame=df,
                               class_column='Name',
                               color=rgba)
        self._check_colors(ax.get_lines()[:10],
                           linecolors=rgba,
                           mapping=df['Name'][:10])

        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(andrews_curves,
                               frame=df,
                               class_column='Name',
                               color=cnames)
        self._check_colors(ax.get_lines()[:10],
                           linecolors=cnames,
                           mapping=df['Name'][:10])

        ax = _check_plot_works(andrews_curves,
                               frame=df,
                               class_column='Name',
                               colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10],
                           linecolors=cmaps,
                           mapping=df['Name'][:10])

        colors = ['b', 'g', 'r']
        df = DataFrame({
            "A": [1, 2, 3],
            "B": [1, 2, 3],
            "C": [1, 2, 3],
            "Name": colors
        })
        ax = andrews_curves(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, linecolors=colors)

        with tm.assert_produces_warning(FutureWarning):
            andrews_curves(data=df, class_column='Name')
Example #36
 def keyfunc(x):
     import re
     numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
     return lmap(int, numeric_tuple)
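Used as a sort key, this yields numeric rather than lexicographic ordering of suffixed names (hedged usage note; the names are hypothetical):

names = ['panel_1_10', 'panel_1_2', 'panel_1_1']
# sorted(names)              -> ['panel_1_1', 'panel_1_10', 'panel_1_2']  lexicographic
# sorted(names, key=keyfunc) -> ['panel_1_1', 'panel_1_2', 'panel_1_10']  numeric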
Example #37
        def _do_test(df,
                     r_dtype=None,
                     c_dtype=None,
                     rnlvl=None,
                     cnlvl=None,
                     dupe_col=False):

            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs['index_col'] = lrange(rnlvl)
                kwargs['header'] = lrange(cnlvl)
                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path,
                              encoding='utf8',
                              chunksize=chunksize,
                              tupleize_cols=False)
                    recons = DataFrame.from_csv(path,
                                                tupleize_cols=False,
                                                **kwargs)
            else:
                kwargs['header'] = 0
                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = DataFrame.from_csv(path, **kwargs)

            def _to_uni(x):
                if not isinstance(x, compat.text_type):
                    return x.decode('utf8')
                return x

            if dupe_col:
                # read_csv disambiguates the columns by
                # labeling them dupe.1, dupe.2, etc. Monkey-patch the columns
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                delta_lvl = [
                    recons.iloc[:, i].values for i in range(rnlvl - 1)
                ]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
            if r_dtype:
                if r_dtype == 'u':  # unicode
                    r_dtype = 'O'
                    recons.index = np.array(lmap(_to_uni, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
                elif r_dtype == 'dt':  # datetime
                    r_dtype = 'O'
                    recons.index = np.array(lmap(Timestamp, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(Timestamp, df.index),
                                        dtype=r_dtype)
                elif r_dtype == 'p':
                    r_dtype = 'O'
                    recons.index = np.array(list(
                        map(Timestamp, recons.index.to_datetime())),
                                            dtype=r_dtype)
                    df.index = np.array(list(
                        map(Timestamp, df.index.to_datetime())),
                                        dtype=r_dtype)
                else:
                    r_dtype = type_map.get(r_dtype)
                    recons.index = np.array(recons.index, dtype=r_dtype)
                    df.index = np.array(df.index, dtype=r_dtype)
            if c_dtype:
                if c_dtype == 'u':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(_to_uni, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(_to_uni, df.columns),
                                          dtype=c_dtype)
                elif c_dtype == 'dt':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(Timestamp, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(Timestamp, df.columns),
                                          dtype=c_dtype)
                elif c_dtype == 'p':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(
                        Timestamp, recons.columns.to_datetime()),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(Timestamp,
                                               df.columns.to_datetime()),
                                          dtype=c_dtype)
                else:
                    c_dtype = type_map.get(c_dtype)
                    recons.columns = np.array(recons.columns, dtype=c_dtype)
                    df.columns = np.array(df.columns, dtype=c_dtype)

            assert_frame_equal(df,
                               recons,
                               check_names=False,
                               check_less_precise=True)
Example #38
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
    tups = lmap(tuple, df[keys].values)
    tups = com._asarray_tuplesafe(tups)
    expected = f(df.groupby(tups)[field])
    for k, v in compat.iteritems(expected):
        assert (result[k] == v)
Example #39
def _get_standard_colors(num_colors=None,
                         colormap=None,
                         color_type='default',
                         color=None):
    import matplotlib.pyplot as plt

    if color is None and colormap is not None:
        if isinstance(colormap, str):
            import matplotlib.cm as cm
            cmap = colormap
            colormap = cm.get_cmap(colormap)
            if colormap is None:
                raise ValueError("Colormap {0} is not recognized".format(cmap))
        colors = lmap(colormap, np.linspace(0, 1, num=num_colors))
    elif color is not None:
        if colormap is not None:
            warnings.warn("'color' and 'colormap' cannot be used "
                          "simultaneously. Using 'color'")
        colors = list(color) if is_list_like(color) else color
    else:
        if color_type == 'default':
            # need to call list() on the result to copy so we don't
            # modify the global rcParams below
            try:
                colors = [
                    c['color'] for c in list(plt.rcParams['axes.prop_cycle'])
                ]
            except KeyError:
                colors = list(
                    plt.rcParams.get('axes.color_cycle', list('bgrcmyk')))
            if isinstance(colors, str):
                colors = list(colors)

            colors = colors[0:num_colors]
        elif color_type == 'random':
            import pandas.core.common as com

            def random_color(column):
                """ Returns a random color represented as a list of length 3"""
                # GH17525 use common._random_state to avoid resetting the seed
                rs = com.random_state(column)
                return rs.rand(3).tolist()

            colors = lmap(random_color, lrange(num_colors))
        else:
            raise ValueError("color_type must be either 'default' or 'random'")

    if isinstance(colors, str):
        import matplotlib.colors
        conv = matplotlib.colors.ColorConverter()

        def _maybe_valid_colors(colors):
            try:
                [conv.to_rgba(c) for c in colors]
                return True
            except ValueError:
                return False

        # check whether the string can be converted to a single color
        maybe_single_color = _maybe_valid_colors([colors])
        # check whether each character can be converted to a color
        maybe_color_cycle = _maybe_valid_colors(list(colors))
        if maybe_single_color and maybe_color_cycle and len(colors) > 1:
            hex_color = [
                c['color'] for c in list(plt.rcParams['axes.prop_cycle'])
            ]
            colors = [hex_color[int(colors[1])]]
        elif maybe_single_color:
            colors = [colors]
        else:
            # ``colors`` is regarded as a color cycle;
            # mpl will raise an error if any of them is invalid
            pass

    # Append more colors by cycling if there are not enough colors.
    # Extra colors will be ignored by matplotlib if there are more colors
    # than needed, so nothing needs to be done here.
    if len(colors) < num_colors:
        try:
            multiple = num_colors // len(colors) - 1
        except ZeroDivisionError:
            raise ValueError("Invalid color argument: ''")
        mod = num_colors % len(colors)

        colors += multiple * colors
        colors += colors[:mod]

    return colors
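The cycling logic at the end means asking for more colours than supplied simply repeats the cycle (hedged usage sketch; assumes the function above is importable and matplotlib is installed):

colors = _get_standard_colors(num_colors=5, color=['r', 'g'])
# -> ['r', 'g', 'r', 'g', 'r']    two colours recycled to cover five series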
Example #40
def _get_standard_colors(num_colors=None,
                         colormap=None,
                         color_type='default',
                         color=None):
    import matplotlib.pyplot as plt

    if color is None and colormap is not None:
        if isinstance(colormap, compat.string_types):
            import matplotlib.cm as cm
            cmap = colormap
            colormap = cm.get_cmap(colormap)
            if colormap is None:
                raise ValueError("Colormap {0} is not recognized".format(cmap))
        colors = lmap(colormap, np.linspace(0, 1, num=num_colors))
    elif color is not None:
        if colormap is not None:
            warnings.warn("'color' and 'colormap' cannot be used "
                          "simultaneously. Using 'color'")
        colors = list(color) if is_list_like(color) else color
    else:
        if color_type == 'default':
            # need to call list() on the result to copy so we don't
            # modify the global rcParams below
            try:
                colors = [
                    c['color'] for c in list(plt.rcParams['axes.prop_cycle'])
                ]
            except KeyError:
                colors = list(
                    plt.rcParams.get('axes.color_cycle', list('bgrcmyk')))
            if isinstance(colors, compat.string_types):
                colors = list(colors)
        elif color_type == 'random':
            import pandas.core.common as com

            def random_color(column):
                """ Returns a random color represented as a list of length 3"""
                # GH17525 use common._random_state to avoid resetting the seed
                rs = com._random_state(column)
                return rs.rand(3).tolist()

            colors = lmap(random_color, lrange(num_colors))
        else:
            raise ValueError("color_type must be either 'default' or 'random'")

    if isinstance(colors, compat.string_types):
        import matplotlib.colors
        conv = matplotlib.colors.ColorConverter()

        def _maybe_valid_colors(colors):
            try:
                [conv.to_rgba(c) for c in colors]
                return True
            except ValueError:
                return False

        # check whether the string can be converted to a single color
        maybe_single_color = _maybe_valid_colors([colors])
        # check whether each character can be converted to a color
        maybe_color_cycle = _maybe_valid_colors(list(colors))
        if maybe_single_color and maybe_color_cycle and len(colors) > 1:
            # Special case for single str 'CN' match and convert to hex
            # for supporting matplotlib < 2.0.0
            if re.match(r'\AC[0-9]\Z', colors) and _mpl_ge_2_0_0():
                hex_color = [
                    c['color'] for c in list(plt.rcParams['axes.prop_cycle'])
                ]
                colors = [hex_color[int(colors[1])]]
            else:
                # this may no longer be required
                msg = ("'{0}' can be parsed as both single color and "
                       "color cycle. Specify each color using a list "
                       "like ['{0}'] or {1}")
                raise ValueError(msg.format(colors, list(colors)))
        elif maybe_single_color:
            colors = [colors]
        else:
            # ``colors`` is regarded as a color cycle;
            # mpl will raise an error if any of them is invalid
            pass

    if len(colors) != num_colors:
        try:
            multiple = num_colors // len(colors) - 1
        except ZeroDivisionError:
            raise ValueError("Invalid color argument: ''")
        mod = num_colors % len(colors)

        colors += multiple * colors
        colors += colors[:mod]

    return colors
Example #41
    def test_iloc_mask(self):

        # GH 3631, iloc with a mask (of a series) should raise
        df = DataFrame(lrange(5), list('ABCDE'), columns=['a'])
        mask = (df.a % 2 == 0)
        pytest.raises(ValueError, df.iloc.__getitem__, tuple([mask]))
        mask.index = lrange(len(mask))
        pytest.raises(NotImplementedError, df.iloc.__getitem__,
                      tuple([mask]))

        # ndarray ok
        result = df.iloc[np.array([True] * len(mask), dtype=bool)]
        tm.assert_frame_equal(result, df)

        # the possibilities
        locs = np.arange(4)
        nums = 2 ** locs
        reps = lmap(bin, nums)
        df = DataFrame({'locs': locs, 'nums': nums}, reps)

        expected = {
            (None, ''): '0b1100',
            (None, '.loc'): '0b1100',
            (None, '.iloc'): '0b1100',
            ('index', ''): '0b11',
            ('index', '.loc'): '0b11',
            ('index', '.iloc'): ('iLocation based boolean indexing '
                                 'cannot use an indexable as a mask'),
            ('locs', ''): 'Unalignable boolean Series provided as indexer '
                          '(index of the boolean Series and of the indexed '
                          'object do not match',
            ('locs', '.loc'): 'Unalignable boolean Series provided as indexer '
                              '(index of the boolean Series and of the '
                              'indexed object do not match',
            ('locs', '.iloc'): ('iLocation based boolean indexing on an '
                                'integer type is not available'),
        }

        # UserWarnings from reindex of a boolean mask
        with catch_warnings(record=True):
            result = dict()
            for idx in [None, 'index', 'locs']:
                mask = (df.nums > 2).values
                if idx:
                    mask = Series(mask, list(reversed(getattr(df, idx))))
                for method in ['', '.loc', '.iloc']:
                    try:
                        if method:
                            accessor = getattr(df, method[1:])
                        else:
                            accessor = df
                        ans = str(bin(accessor[mask]['nums'].sum()))
                    except Exception as e:
                        ans = str(e)

                    key = tuple([idx, method])
                    r = expected.get(key)
                    if r != ans:
                        raise AssertionError(
                            "[%s] does not match [%s], received [%s]"
                            % (key, ans, r))