Example #1
def maybe_read_encoded_stream(reader, encoding=None):
    """ read an encoded stream from the reader and transform the bytes to unicode
        if required based on the encoding

        Parameters
        ----------
        reader : a streamable file-like object
        encoding : optional, the encoding to attempt to read

        Returns
        -------
        a tuple of (a stream of decoded bytes, the encoding which was used)

        """

    if compat.PY3 or encoding is not None:  # pragma: no cover
        if encoding:
            errors = 'strict'
        else:
            errors = 'replace'
            encoding = 'utf-8'
        reader = StringIO(reader.read().decode(encoding, errors))
    else:
        encoding = None
    return reader, encoding
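
For orientation, the decode-with-fallback behaviour above can be shown in a self-contained sketch (the decode_stream name is illustrative, not part of the original module): on Python 3 the byte stream is always decoded, strictly when an encoding is given and leniently as UTF-8 otherwise.

from io import BytesIO, StringIO

def decode_stream(reader, encoding=None):
    # strict errors when the caller names an encoding, lenient utf-8 otherwise
    errors = 'strict' if encoding else 'replace'
    encoding = encoding or 'utf-8'
    return StringIO(reader.read().decode(encoding, errors)), encoding

stream, used = decode_stream(BytesIO(b'col,val\na,1\n'))
print(used)           # 'utf-8'
print(stream.read())  # 'col,val\na,1\n'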
Example #2
    def test_to_html(self):
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))

        biggie.loc[:20, 'A'] = np.nan
        biggie.loc[:20, 'B'] = np.nan
        s = biggie.to_html()

        buf = StringIO()
        retval = biggie.to_html(buf=buf)
        assert retval is None
        assert buf.getvalue() == s

        assert isinstance(s, compat.string_types)

        biggie.to_html(columns=['B', 'A'], col_space=17)
        biggie.to_html(columns=['B', 'A'],
                       formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

        biggie.to_html(columns=['B', 'A'], float_format=str)
        biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

        frame = DataFrame(index=np.arange(200))
        frame.to_html()
Example #3
    def test_to_csv_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC)

        result = buf.getvalue()
        expected = ('"A","B"\n'
                    '1,"foo"\n'
                    '2,"bar"\n'
                    '3,"baz"\n')

        self.assertEqual(result, expected)

        # quoting windows line terminators, presents with encoding?
        # #3503
        text = 'a,b,c\n1,"test \r\n",3\n'
        df = pd.read_csv(StringIO(text))
        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        self.assertEqual(buf.getvalue(), text)

        # testing if quoting parameter is passed through with multi-indexes
        # related to issue #7791
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])
        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
        self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
Example #4
 def test_to_csv_stringio(self):
     buf = StringIO()
     self.frame.to_csv(buf)
     buf.seek(0)
     recons = read_csv(buf, index_col=0)
     # TODO to_csv drops column name
     assert_frame_equal(recons, self.frame, check_names=False)
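
The same round-trip works outside the test harness; a minimal self-contained version (column-name checking aside):

from io import StringIO
import pandas as pd

df = pd.DataFrame({'x': [1, 2], 'y': ['a', 'b']})
buf = StringIO()
df.to_csv(buf)
buf.seek(0)                      # rewind before reading back
recons = pd.read_csv(buf, index_col=0)
pd.testing.assert_frame_equal(recons, df)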
Example #5
    def test_to_html(self):
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))

        biggie.loc[:20, 'A'] = np.nan
        biggie.loc[:20, 'B'] = np.nan
        s = biggie.to_html()

        buf = StringIO()
        retval = biggie.to_html(buf=buf)
        self.assertIsNone(retval)
        self.assertEqual(buf.getvalue(), s)

        tm.assertIsInstance(s, compat.string_types)

        biggie.to_html(columns=['B', 'A'], col_space=17)
        biggie.to_html(columns=['B', 'A'],
                       formatters={'A': lambda x: '%.1f' % x})

        biggie.to_html(columns=['B', 'A'], float_format=str)
        biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

        frame = DataFrame(index=np.arange(200))
        frame.to_html()
Example #6
    def _read_one_data(self, ftppath, params):

        if re.search(_ZIP_RE, ftppath) is not None:
            index_file = self._read_zipfile(ftppath)
        elif re.search(_GZ_RE, ftppath) is not None:
            index_file = self._read_gzfile(ftppath)
        else:
            index_file = StringIO()
            index_list = []
            try:
                self._sec_ftp_session.retrlines('RETR ' + ftppath,
                                                index_list.append)
            except EOFError:
                raise RemoteDataError('FTP server has closed the connection.')

            for line in index_list:
                index_file.write(line + '\n')
            index_file.seek(0)

        index_file = self._remove_header(index_file)
        index = read_csv(index_file, delimiter='|', header=None,
                         index_col=False, names=_COLUMNS,
                         low_memory=False, dtype=_COLUMN_TYPES)
        index['filename'] = index['filename'].map(self._fix_old_file_paths)
        return index
Example #7
    def test_to_csv_numpy_16_bug(self):
        frame = DataFrame({'a': date_range('1/1/2000', periods=10)})

        buf = StringIO()
        frame.to_csv(buf)

        result = buf.getvalue()
        self.assertIn('2000-01-01', result)
Example #8
def to_clipboard(obj, excel=True, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv
            format for allowing easy pasting into excel.
            if False, write a string representation of the object
            to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # testing if an invalid encoding is passed to clipboard
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise ValueError('clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
            text = buf.getvalue()
            if PY2:
                text = text.decode('utf-8')
            clipboard_set(text)
            return
        except TypeError:
            warnings.warn('to_clipboard in excel mode requires a single '
                          'character separator.')
    elif sep is not None:
        warnings.warn('to_clipboard with excel=False ignores the sep argument')

    if isinstance(obj, ABCDataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context('display.max_colwidth', 999999):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
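
The excel=True branch is essentially a to_csv into an in-memory buffer with a tab separator; that step in isolation (clipboard_set left out, since it touches the system clipboard):

from io import StringIO
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
buf = StringIO()
df.to_csv(buf, sep='\t', encoding='utf-8')   # tab-separated text pastes cleanly into Excel
text = buf.getvalue()
print(repr(text))    # '\ta\tb\n0\t1\t3\n1\t2\t4\n'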
Example #9
    def test_to_csv_unicode_index(self):
        buf = StringIO()
        s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])

        s.to_csv(buf, encoding="UTF-8")
        buf.seek(0)

        s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
        assert_series_equal(s, s2)
Example #10
    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()

        frame = DataFrame([[1, 2.0]], columns=["a", "a"])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        self.assertEqual("a    1 non-null int64\n", lines[3])
        self.assertEqual("a    1 non-null float64\n", lines[4])
Example #11
 def test_to_csv_quote_none(self):
     # GH4328
     df = DataFrame({'A': ['hello', '{"hello"}']})
     for encoding in (None, 'utf-8'):
         buf = StringIO()
         df.to_csv(buf, quoting=csv.QUOTE_NONE,
                   encoding=encoding, index=False)
         result = buf.getvalue()
         expected = 'A\nhello\n{"hello"}\n'
         self.assertEqual(result, expected)
Example #12
    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()

        frame = DataFrame([[1, 2.0]],
                          columns=['a', 'a'])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        assert 'a    1 non-null int64\n' == lines[3]
        assert 'a    1 non-null float64\n' == lines[4]
Example #13
    def test_to_csv_index_no_leading_comma(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, index_label=False)
        expected = ('A,B\n'
                    'one,1,4\n'
                    'two,2,5\n'
                    'three,3,6\n')
        self.assertEqual(buf.getvalue(), expected)
Example #14
    def _remove_header(self, data):
        header = True
        cleaned_datafile = StringIO()
        for line in data:
            if header is False:
                cleaned_datafile.write(line + '\n')
            elif re.search(_DIVIDER, line) is not None:
                header = False

        cleaned_datafile.seek(0)
        return cleaned_datafile
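
A self-contained illustration of the header-stripping loop (the divider pattern here is made up for the demo; the real _DIVIDER lives alongside the SEC index reader):

import re
from io import StringIO

_DIVIDER = re.compile('^-{5,}')      # illustrative: a row of dashes ends the header

def remove_header(data):
    header = True
    cleaned = StringIO()
    for line in data:
        if header is False:
            cleaned.write(line + '\n')
        elif re.search(_DIVIDER, line) is not None:
            header = False
    cleaned.seek(0)
    return cleaned

raw = ['Some banner text', '----------', 'a|b', 'c|d']
print(remove_header(raw).read())     # 'a|b\nc|d\n'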
Example #15
def test_to_csv_gcs(mock):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        s = StringIO()
        instance = MockFileSystem.return_value
        instance.open.return_value = s

        df1.to_csv('gs://test/test.csv', index=True)
        df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
Example #16
    def test_constructor_bad_file(self):
        non_file = StringIO('I am not a file')
        non_file.fileno = lambda: -1

        msg = "Invalid argument"
        tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file)

        target = open(self.mmap_file, 'r')
        target.close()

        msg = "I/O operation on closed file"
        tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)
Example #17
 def test_info_shows_column_dtypes(self):
     dtypes = ["int64", "float64", "datetime64[ns]", "timedelta64[ns]", "complex128", "object", "bool"]
     data = {}
     n = 10
     for i, dtype in enumerate(dtypes):
         data[i] = np.random.randint(2, size=n).astype(dtype)
     df = DataFrame(data)
     buf = StringIO()
     df.info(buf=buf)
     res = buf.getvalue()
     for i, dtype in enumerate(dtypes):
         name = "%d    %d non-null %s" % (i, n, dtype)
         assert name in res
Example #18
    def test_to_csv_index_no_leading_comma(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, index_label=False)

        expected_rows = ['A,B',
                         'one,1,4',
                         'two,2,5',
                         'three,3,6']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert buf.getvalue() == expected
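
tm.convert_rows_list_to_csv_str is a pandas test helper; an assumed equivalent (the real one may differ in detail) just joins the rows with the platform line terminator, which is why the same expected list works on Windows and Unix:

import os

def convert_rows_list_to_csv_str(rows):
    # join with the OS line separator and terminate the final row as well
    sep = os.linesep
    return sep.join(rows) + sep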
Example #19
    def test_to_csv_unicode_index_col(self):
        buf = StringIO('')
        df = DataFrame(
            [[u("\u05d0"), "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]],
            columns=[u("\u05d0"),
                     u("\u05d1"), u("\u05d2"), u("\u05d3")],
            index=[u("\u05d0"), u("\u05d1")])

        df.to_csv(buf, encoding='UTF-8')
        buf.seek(0)

        df2 = read_csv(buf, index_col=0, encoding='UTF-8')
        assert_frame_equal(df, df2)
Example #20
    def test_repr_bool_fails(self):
        s = Series([DataFrame(np.random.randn(2, 2)) for i in range(5)])

        import sys

        buf = StringIO()
        tmp = sys.stderr
        sys.stderr = buf
        try:
            # it works (with no Cython exception barf)!
            repr(s)
        finally:
            sys.stderr = tmp
        self.assertEqual(buf.getvalue(), '')
Example #21
    def test_info_memory_usage_qualified(self):

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=[1, 2, 3])
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=list('ABC'))
        df.info(buf=buf)
        assert '+' in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), range(3)]))
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), ['foo', 'bar']]))
        df.info(buf=buf)
        assert '+' in buf.getvalue()
Example #22
    def test_to_csv_quote_none(self):
        # GH4328
        df = DataFrame({'A': ['hello', '{"hello"}']})
        for encoding in (None, 'utf-8'):
            buf = StringIO()
            df.to_csv(buf, quoting=csv.QUOTE_NONE,
                      encoding=encoding, index=False)

            result = buf.getvalue()
            expected_rows = ['A',
                             'hello',
                             '{"hello"}']
            expected = tm.convert_rows_list_to_csv_str(expected_rows)
            assert result == expected
Example #23
    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (10, True)]:
            # For verbose always      ^ setting  ^ summarize ^ full output
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, verbose in [(10, None), (5, False), (10, True)]:

            # max_cols not exceeded
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, max_cols in [(10, 5), (5, 4)]:
            # setting truncates
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

            # setting wouldn't truncate
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)
Example #24
 def test_info_shows_column_dtypes(self):
     dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
               'complex128', 'object', 'bool']
     data = {}
     n = 10
     for i, dtype in enumerate(dtypes):
         data[i] = np.random.randint(2, size=n).astype(dtype)
     df = DataFrame(data)
     buf = StringIO()
     df.info(buf=buf)
     res = buf.getvalue()
     for i, dtype in enumerate(dtypes):
         name = '%d    %d non-null %s' % (i, n, dtype)
         assert name in res
Example #25
def test_to_csv_gcs(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    s = StringIO()

    class MockGCSFileSystem():
        def open(*args):
            return s

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df1.to_csv('gs://test/test.csv', index=True)
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
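
Examples #15 and #25 exercise the same code path with different tools: unittest.mock swaps the class inside a with-block, while pytest's monkeypatch fixture undoes the substitution at test teardown. Either way the trick is the same, handing the writer a StringIO so the "remote" bytes can be inspected; a stripped-down sketch of that stand-in:

from io import StringIO

class FakeFS(object):
    # stand-in for a remote filesystem: open() always hands back one buffer
    def __init__(self):
        self.buf = StringIO()

    def open(self, *args, **kwargs):
        return self.buf

fs = FakeFS()
fs.open('gs://bucket/key').write('captured output')
print(fs.buf.getvalue())   # 'captured output'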
Example #26
    def test_to_csv_unicodewriter_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
                  encoding='utf-8')

        result = buf.getvalue()
        expected = ('"A","B"\n'
                    '1,"foo"\n'
                    '2,"bar"\n'
                    '3,"baz"\n')

        self.assertEqual(result, expected)
Example #27
    def test_to_csv_unicodewriter_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
                  encoding='utf-8')

        result = buf.getvalue()
        expected_rows = ['"A","B"',
                         '1,"foo"',
                         '2,"bar"',
                         '3,"baz"']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert result == expected
Example #28
    def test_constructor_bad_file(self):
        if is_platform_windows():
            raise nose.SkipTest("skipping construction error messages "
                                "tests on windows")

        non_file = StringIO('I am not a file')
        non_file.fileno = lambda: -1

        msg = "Invalid argument"
        tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file)

        target = open(self.mmap_file, 'r')
        target.close()

        msg = "I/O operation on closed file"
        tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)
Example #29
 def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
     # Redirect output to a queue
     self.queue = StringIO()
     self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
     self.stream = f
     self.encoder = codecs.getincrementalencoder(encoding)()
     self.quoting = kwds.get("quoting", None)
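
This __init__ is the first half of the UnicodeWriter recipe from the Python 2 csv documentation. The companion writerow, reproduced below as a sketch from that recipe (it is not part of the excerpt above), routes each row through the StringIO queue, then re-encodes into the target stream:

def writerow(self, row):
    # write the row to the in-memory queue as UTF-8
    self.writer.writerow([s.encode("utf-8") for s in row])
    data = self.queue.getvalue().decode("utf-8")
    # re-encode with the target encoding and push to the real stream
    self.stream.write(self.encoder.encode(data))
    # empty the queue for the next row
    self.queue.truncate(0)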
Example #30
def _get_pretty_string(obj):
    """Return a prettier version of obj

    Parameters
    ----------
    obj : object
        Object to pretty print

    Returns
    -------
    s : str
        Pretty print object repr
    """
    sio = StringIO()
    pprint.pprint(obj, stream=sio)
    return sio.getvalue()
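
Usage is direct: pprint writes into the StringIO instead of stdout, so the formatted repr comes back as a string (note the trailing newline pprint always emits):

>>> _get_pretty_string({'b': 2, 'a': [1, 2, 3]})
"{'a': [1, 2, 3], 'b': 2}\n"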
Example #31
 def test_squeeze_no_view(self):
     # see gh-8217
     # Series should not be a view
     data = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13"""
     result = self.read_csv(StringIO(data), index_col='time', squeeze=True)
     self.assertFalse(result._is_view)
Example #32
def bdi(itype='D', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.BDI_URL %
                              (ct.P_TYPE['http'], ct.DOMAINS['v500']))
            lines = urlopen(request, timeout=10).read()
            if len(lines) < 100:  #no data
                return None
        except Exception as e:
            print(e)
        else:
            linestr = lines.decode('utf-8') if ct.PY3 else lines
            if itype == 'D':  # Daily
                reg = re.compile(r'\"chart_data\",\"(.*?)\"\);')
                lines = reg.findall(linestr)
                lines = lines[0]
                lines = lines.replace('chart', 'table').\
                        replace('</series><graphs>', '').\
                        replace('</graphs>', '').\
                        replace('series', 'tr').\
                        replace('value', 'td').\
                        replace('graph', 'tr').\
                        replace('graphs', 'td')
                df = pd.read_html(lines, encoding='utf8')[0]
                df = df.T
                df.columns = ['date', 'index']
                df['date'] = df['date'].map(lambda x: x.replace(u'年', '-')).\
                    map(lambda x: x.replace(u'月', '-')).\
                    map(lambda x: x.replace(u'日', ''))
                df['date'] = pd.to_datetime(df['date'])
                df['index'] = df['index'].astype(float)
                df = df.sort_values('date',
                                    ascending=False).reset_index(drop=True)
                df['change'] = df['index'].pct_change(-1)
                df['change'] = df['change'] * 100
                df['change'] = df['change'].map(lambda x: '%.2f' % x)
                df['change'] = df['change'].astype(float)
                return df
            else:  #Weekly
                html = lxml.html.parse(StringIO(linestr))
                res = html.xpath(
                    "//table[@class=\"style33\"]/tr/td/table[last()]")
                if ct.PY3:
                    sarr = [
                        etree.tostring(node).decode('utf-8') for node in res
                    ]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>' % sarr
                df = pd.read_html(sarr)[0][1:]
                df.columns = ['month', 'index']
                df['month'] = df['month'].map(lambda x: x.replace(u'年', '-')).\
                    map(lambda x: x.replace(u'月', ''))
                df['month'] = pd.to_datetime(df['month'])
                df['month'] = df['month'].map(lambda x: str(x).replace('-', '')).\
                              map(lambda x: x[:6])
                df['index'] = df['index'].astype(float)
                df['change'] = df['index'].pct_change(-1)
                df['change'] = df['change'].map(lambda x: '%.2f' % x)
                df['change'] = df['change'].astype(float)
                return df
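
Both branches funnel the downloaded markup through StringIO so that lxml and read_html can treat it as a file. The weekly branch's core, reduced to a runnable sketch with dummy markup:

from io import StringIO
import lxml.html

html = '<html><body><table class="style33"><tr><td>x</td></tr></table></body></html>'
doc = lxml.html.parse(StringIO(html))     # StringIO stands in for a file on disk
print(len(doc.xpath('//table[@class="style33"]')))   # 1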
Example #33
    def test_verbose_import(self):
        text = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""

        buf = StringIO()
        sys.stdout = buf

        try:  # engines are verbose in different ways
            self.read_csv(StringIO(text), verbose=True)
            if self.engine == 'c':
                self.assertIn('Tokenization took:', buf.getvalue())
                self.assertIn('Parser memory cleanup took:', buf.getvalue())
            else:  # Python engine
                self.assertEqual(buf.getvalue(),
                                 'Filled 3 NA values in column a\n')
        finally:
            sys.stdout = sys.__stdout__

        buf = StringIO()
        sys.stdout = buf

        text = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

        try:  # engines are verbose in different ways
            self.read_csv(StringIO(text), verbose=True, index_col=0)
            if self.engine == 'c':
                self.assertIn('Tokenization took:', buf.getvalue())
                self.assertIn('Parser memory cleanup took:', buf.getvalue())
            else:  # Python engine
                self.assertEqual(buf.getvalue(),
                                 'Filled 1 NA values in column a\n')
        finally:
            sys.stdout = sys.__stdout__
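
Hand-swapping sys.stdout, as above, needs the try/finally to avoid leaking the redirect; on Python 3 the same capture reads more safely with contextlib.redirect_stdout (a sketch of the alternative, not how this suite is written):

import contextlib
from io import StringIO

buf = StringIO()
with contextlib.redirect_stdout(buf):
    print('captured')               # anything printed here lands in buf
assert buf.getvalue() == 'captured\n'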
Example #34
    def test_malformed(self):
        # see gh-6607

        # all
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
        msg = 'Expected 3 fields in line 4, saw 5'
        with tm.assertRaisesRegexp(Exception, msg):
            self.read_table(StringIO(data), sep=',', header=1, comment='#')

        # first chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        msg = 'Expected 3 fields in line 6, saw 5'
        with tm.assertRaisesRegexp(Exception, msg):
            it = self.read_table(StringIO(data),
                                 sep=',',
                                 header=1,
                                 comment='#',
                                 iterator=True,
                                 chunksize=1,
                                 skiprows=[2])
            it.read(5)

        # middle chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        msg = 'Expected 3 fields in line 6, saw 5'
        with tm.assertRaisesRegexp(Exception, msg):
            it = self.read_table(StringIO(data),
                                 sep=',',
                                 header=1,
                                 comment='#',
                                 iterator=True,
                                 chunksize=1,
                                 skiprows=[2])
            it.read(3)

        # last chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        msg = 'Expected 3 fields in line 6, saw 5'
        with tm.assertRaisesRegexp(Exception, msg):
            it = self.read_table(StringIO(data),
                                 sep=',',
                                 header=1,
                                 comment='#',
                                 iterator=True,
                                 chunksize=1,
                                 skiprows=[2])
            it.read()

        # skip_footer is not supported with the C parser yet
        if self.engine == 'python':
            # skip_footer
            data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
            msg = 'Expected 3 fields in line 4, saw 5'
            with tm.assertRaisesRegexp(Exception, msg):
                self.read_table(StringIO(data),
                                sep=',',
                                header=1,
                                comment='#',
                                skip_footer=1)
Example #35
 def test_float_parser(self):
     # see gh-9565
     data = '45e-1,4.5,45.,inf,-inf'
     result = self.read_csv(StringIO(data), header=None)
     expected = DataFrame([[float(s) for s in data.split(',')]])
     tm.assert_frame_equal(result, expected)
Example #36
def read_json(path_or_buf=None,
              orient=None,
              typ='frame',
              dtype=True,
              convert_axes=True,
              convert_dates=True,
              keep_default_dates=True,
              numpy=False,
              precise_float=False,
              date_unit=None,
              encoding=None,
              lines=False):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be ``file://localhost/path/to/table.json``

    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; if True, then try to parse
        datelike columns. A column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a DataFrame using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d
    """

    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                      encoding=encoding)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
                json = fh.read()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.
        lines = list(StringIO(json.strip()))
        json = u'[' + u','.join(lines) + u']'

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
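
The lines=True branch leans on a small StringIO property: iterating a StringIO yields one line at a time, so a JSON-lines payload can be stitched into one JSON array. In isolation:

from io import StringIO

json_lines = '{"a": 1}\n{"a": 2}\n{"a": 3}'
lines = list(StringIO(json_lines.strip()))   # one string per input line
combined = u'[' + u','.join(lines) + u']'
# the embedded newlines are legal JSON whitespace:
# '[{"a": 1}\n,{"a": 2}\n,{"a": 3}]'
print(combined)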
Example #37
 def _test(text, **kwargs):
     nice_text = text.replace('\r', '\r\n')
     result = TextReader(StringIO(text), **kwargs).read()
     expected = TextReader(StringIO(nice_text), **kwargs).read()
     assert_array_dicts_equal(result, expected)
Example #38
 def test_ignore_leading_whitespace(self):
     # see gh-3374, gh-6607
     data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9'
     result = self.read_table(StringIO(data), sep=r'\s+')
     expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]})
     tm.assert_frame_equal(result, expected)
Example #39
 def test_empty_with_index(self):
     # see gh-10184
     data = 'x,y'
     result = self.read_csv(StringIO(data), index_col=0)
     expected = DataFrame([], columns=['y'], index=Index([], name='x'))
     tm.assert_frame_equal(result, expected)
Example #40
def top_list(date = None, retry_count=3, pause=0.001):
    """
    获取每日龙虎榜列表
    Parameters
    --------
    date:string
                明细数据日期 format:YYYY-MM-DD 如果为空,返回最近一个交易日的数据
    retry_count : int, 默认 3
                 如遇网络等问题重复执行的次数 
    pause : int, 默认 0
                重复请求数据过程中暂停的秒数,防止请求间隔时间太短出现的问题
    
    Return
    ------
    DataFrame
        code:代码
        name :名称
        pchange:涨跌幅     
        amount:龙虎榜成交额(万)
        buy:买入额(万)
        bratio:占总成交比例
        sell:卖出额(万)
        sratio :占总成交比例
        reason:上榜原因
        date  :日期
    """
    if date is None:
        if du.get_hour() < 18:
            date = du.last_tddate()
        else:
            date = du.today() 
    else:
        if(du.is_holiday(date)):
            return None
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_URL%(ct.P_TYPE['http'], ct.DOMAINS['em'], date))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dt_1\"]")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr)[0]
            df.columns = [i for i in range(1,12)]
            df = df.apply(_f_rows, axis=1)
            df = df.fillna(method='ffill')
            df = df.drop([1, 4], axis=1)
            df.columns = rv.LHB_COLS
            df = df.drop_duplicates()
            df['code'] = df['code'].astype(int)
            df['code'] = df['code'].map(lambda x: str(x).zfill(6))
            df['date'] = date
        except:
            pass
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #41
    def test_iterator(self):
        # See gh-6607
        reader = self.read_csv(StringIO(self.data1),
                               index_col=0,
                               iterator=True)
        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.read(3)
        tm.assert_frame_equal(chunk, df[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[1:3])

        treader = self.read_table(StringIO(self.data1),
                                  sep=',',
                                  index_col=0,
                                  iterator=True)
        tm.assertIsInstance(treader, TextFileReader)

        # gh-3967: stopping iteration when chunksize is specified
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        reader = self.read_csv(StringIO(data), iterator=True)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[3, 6, 9]),
                             index=['foo', 'bar', 'baz'])
        tm.assert_frame_equal(result[0], expected)

        # chunksize = 1
        reader = self.read_csv(StringIO(data), chunksize=1)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[3, 6, 9]),
                             index=['foo', 'bar', 'baz'])
        self.assertEqual(len(result), 3)
        tm.assert_frame_equal(pd.concat(result), expected)

        # skip_footer is not supported with the C parser yet
        if self.engine == 'python':
            # test bad parameter (skip_footer)
            reader = self.read_csv(StringIO(self.data1),
                                   index_col=0,
                                   iterator=True,
                                   skip_footer=True)
            self.assertRaises(ValueError, reader.read, 3)
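
A compact, self-contained version of the chunked-reading pattern the test exercises:

from io import StringIO
import pandas as pd

data = 'a,b\n1,2\n3,4\n5,6\n'
chunks = list(pd.read_csv(StringIO(data), chunksize=2))
assert len(chunks) == 2                      # rows 0-1, then row 2
pd.testing.assert_frame_equal(pd.concat(chunks),
                              pd.read_csv(StringIO(data)))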
Example #42
    def test_value_counts_datetime64(self, klass):

        # GH 3002, datetime64[ns]
        # don't test names though
        txt = "\n".join([
            'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
            'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'
        ])
        f = StringIO(txt)
        df = pd.read_fwf(f,
                         widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy())
        s.name = None
        idx = pd.to_datetime([
            '2010-01-01 00:00:00', '2008-09-09 00:00:00', '2009-01-01 00:00:00'
        ])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np_array_datetime64_compat([
            '2010-01-01 00:00:00', '2009-01-01 00:00:00', '2008-09-09 00:00:00'
        ],
                                              dtype='datetime64[ns]')
        if isinstance(s, Index):
            tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
        else:
            tm.assert_numpy_array_equal(s.unique(), expected)

        assert s.nunique() == 3

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        result = s.value_counts()
        assert result.index.dtype == 'datetime64[ns]'
        tm.assert_series_equal(result, expected_s)

        result = s.value_counts(dropna=False)
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        assert unique.dtype == 'datetime64[ns]'

        # numpy_array_equal cannot compare pd.NaT
        if isinstance(s, Index):
            exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
            tm.assert_index_equal(unique, exp_idx)
        else:
            tm.assert_numpy_array_equal(unique[:3], expected)
            assert pd.isna(unique[3])

        assert s.nunique() == 3
        assert s.nunique(dropna=False) == 4

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td, name='dt')

        result = td.value_counts()
        expected_s = Series([6], index=[Timedelta('1day')], name='dt')
        tm.assert_series_equal(result, expected_s)

        expected = TimedeltaIndex(['1 days'], name='dt')
        if isinstance(td, Index):
            tm.assert_index_equal(td.unique(), expected)
        else:
            tm.assert_numpy_array_equal(td.unique(), expected.values)

        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2, name='dt')
        result2 = td2.value_counts()
        tm.assert_series_equal(result2, expected_s)
Example #43
 def test_dtype_name_in_info(self, data):
     buf = StringIO()
     pd.DataFrame({"A": data}).info(buf=buf)
     result = buf.getvalue()
     assert data.dtype.name in result
Example #44
def csv_to_df(text):
    df = pd.read_csv(StringIO(bytes_to_str(text)), index_col=0,
                     parse_dates=True, infer_datetime_format=True,
                     na_values='-')[::-1]

    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(df) > 2 and df.index[-1] == df.index[-2]:  # pragma: no cover
        df = df[:-1]

    # Get rid of unicode characters in index name.
    try:
        df.index.name = df.index.name.decode(
            'unicode_escape').encode('ascii', 'ignore')
    except AttributeError:
        # Python 3 string has no decode method.
        df.index.name = df.index.name.encode('ascii', 'ignore').decode()

    column_renames = {'Adj. Open': 'Adj Open', 'Adj. High': 'Adj High',
                      'Adj. Low': 'Adj Low', 'Adj. Close': 'Adj Close',
                      'Adj. Volume': 'Adj Volume'}
    df.rename(columns=column_renames, inplace=True)
    return df.tz_localize(pytz.UTC)
Example #45
 def test_raise_on_sep_with_delim_whitespace(self):
     # see gh-6607
     data = 'a b c\n1 2 3'
     with tm.assertRaisesRegexp(ValueError, 'you can only specify one'):
          self.read_table(StringIO(data), sep=r'\s', delim_whitespace=True)
Example #46
 def test_string_factorize(self):
     # should this be optional?
     data = 'a\nb\na\nb\na'
     reader = TextReader(StringIO(data), header=None)
     result = reader.read()
     assert len(set(map(id, result[0]))) == 2
Example #47
    def test_eof_states(self):
        # see gh-10728, gh-10548

        # With skip_blank_lines = True
        expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c'])

        # gh-10728: WHITESPACE_LINE
        data = 'a,b,c\n4,5,6\n '
        result = self.read_csv(StringIO(data))
        tm.assert_frame_equal(result, expected)

        # gh-10548: EAT_LINE_COMMENT
        data = 'a,b,c\n4,5,6\n#comment'
        result = self.read_csv(StringIO(data), comment='#')
        tm.assert_frame_equal(result, expected)

        # EAT_CRNL_NOP
        data = 'a,b,c\n4,5,6\n\r'
        result = self.read_csv(StringIO(data))
        tm.assert_frame_equal(result, expected)

        # EAT_COMMENT
        data = 'a,b,c\n4,5,6#comment'
        result = self.read_csv(StringIO(data), comment='#')
        tm.assert_frame_equal(result, expected)

        # SKIP_LINE
        data = 'a,b,c\n4,5,6\nskipme'
        result = self.read_csv(StringIO(data), skiprows=[2])
        tm.assert_frame_equal(result, expected)

        # With skip_blank_lines = False

        # EAT_LINE_COMMENT
        data = 'a,b,c\n4,5,6\n#comment'
        result = self.read_csv(StringIO(data),
                               comment='#',
                               skip_blank_lines=False)
        expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c'])
        tm.assert_frame_equal(result, expected)

        # IN_FIELD
        data = 'a,b,c\n4,5,6\n '
        result = self.read_csv(StringIO(data), skip_blank_lines=False)
        expected = DataFrame([['4', 5, 6], [' ', None, None]],
                             columns=['a', 'b', 'c'])
        tm.assert_frame_equal(result, expected)

        # EAT_CRNL
        data = 'a,b,c\n4,5,6\n\r'
        result = self.read_csv(StringIO(data), skip_blank_lines=False)
        expected = DataFrame([[4, 5, 6], [None, None, None]],
                             columns=['a', 'b', 'c'])
        tm.assert_frame_equal(result, expected)

        # Should produce exceptions

        # ESCAPED_CHAR
        data = "a,b,c\n4,5,6\n\\"
        self.assertRaises(Exception,
                          self.read_csv,
                          StringIO(data),
                          escapechar='\\')

        # ESCAPE_IN_QUOTED_FIELD
        data = 'a,b,c\n4,5,6\n"\\'
        self.assertRaises(Exception,
                          self.read_csv,
                          StringIO(data),
                          escapechar='\\')

        # IN_QUOTED_FIELD
        data = 'a,b,c\n4,5,6\n"'
        self.assertRaises(Exception,
                          self.read_csv,
                          StringIO(data),
                          escapechar='\\')
Example #48
 def test_single_line(self):
     # see gh-6607: sniff separator
     df = self.read_csv(StringIO('1,2'), names=['a', 'b'],
                        header=None, sep=None)
     tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df)
Example #49
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({
                0.998: 2,
                1.5: 1,
                2.0: 0,
                2.5: 1
            },
                          index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({
                0.998: 0.5,
                1.5: 0.25,
                2.0: 0.0,
                2.5: 0.25
            },
                           index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = [
                'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'
            ]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(
                s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join([
                'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'
            ])
            f = StringIO(txt)
            df = pd.read_fwf(f,
                             widths=[6, 8, 3],
                             names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy(), name='dt')

            idx = pd.to_datetime([
                '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z',
                '2009-01-01 00:00:00Z'
            ])
            expected_s = Series([3, 2, 1], index=idx, name='dt')
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array([
                '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z',
                '2008-09-09 00:00:00Z'
            ],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT], name='dt')

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            tm.assert_series_equal(result, expected_s)

            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT
                            or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td, name='dt')

            result = td.value_counts()
            expected_s = Series([6], index=[Timedelta('1day')], name='dt')
            tm.assert_series_equal(result, expected_s)

            expected = TimedeltaIndex(['1 days'])
            if isinstance(td, TimedeltaIndex):
                self.assertTrue(td.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(td.unique(), expected.values)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2, name='dt')
            result2 = td2.value_counts()
            tm.assert_series_equal(result2, expected_s)
Example #50
    def test_multiple_date_col(self):
        # Can use multiple date parsers
        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

        def func(*date_cols):
            res = parsing.try_parse_dates(parsers._concat_date_cols(date_cols))
            return res

        df = self.read_csv(StringIO(data),
                           header=None,
                           date_parser=func,
                           prefix='X',
                           parse_dates={
                               'nominal': [1, 2],
                               'actual': [1, 3]
                           })
        assert 'nominal' in df
        assert 'actual' in df
        assert 'X1' not in df
        assert 'X2' not in df
        assert 'X3' not in df

        d = datetime(1999, 1, 27, 19, 0)
        assert df.loc[0, 'nominal'] == d

        df = self.read_csv(StringIO(data),
                           header=None,
                           date_parser=func,
                           parse_dates={
                               'nominal': [1, 2],
                               'actual': [1, 3]
                           },
                           keep_date_col=True)
        assert 'nominal' in df
        assert 'actual' in df

        assert 1 in df
        assert 2 in df
        assert 3 in df

        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
        df = self.read_csv(StringIO(data),
                           header=None,
                           prefix='X',
                           parse_dates=[[1, 2], [1, 3]])

        assert 'X1_X2' in df
        assert 'X1_X3' in df
        assert 'X1' not in df
        assert 'X2' not in df
        assert 'X3' not in df

        d = datetime(1999, 1, 27, 19, 0)
        assert df.loc[0, 'X1_X2'] == d

        df = self.read_csv(StringIO(data),
                           header=None,
                           parse_dates=[[1, 2], [1, 3]],
                           keep_date_col=True)

        assert '1_2' in df
        assert '1_3' in df
        assert 1 in df
        assert 2 in df
        assert 3 in df

        data = '''\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
'''
        df = self.read_csv(StringIO(data),
                           sep=',',
                           header=None,
                           parse_dates=[1],
                           index_col=1)
        d = datetime(1999, 1, 27, 19, 0)
        assert df.index[0] == d
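
The column-merging behaviour of parse_dates=[[1, 2]] used above, shown on a self-contained two-row sample (pandas joins the named columns with a space before parsing, and drops the originals unless keep_date_col=True):

from io import StringIO
import pandas as pd

data = 'X0,X1,X2\nKORD,19990127,19:00:00\nKORD,19990127,20:00:00\n'
df = pd.read_csv(StringIO(data), parse_dates=[[1, 2]])
print(df.columns.tolist())    # ['X1_X2', 'X0']
print(df.loc[0, 'X1_X2'])     # 1999-01-27 19:00:00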