Beispiel #1
0
    def test_fwf_colspecs_None(self):
        # GH 7079
        data = """\
123456
456789
"""
        colspecs = [(0, 3), (3, None)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123, 456], [456, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(None, 3), (3, 6)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123, 456], [456, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(0, None), (3, None)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123456, 456], [456789, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(None, None), (3, 6)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123456, 456], [456789, 789]])
        tm.assert_frame_equal(result, expected)
Beispiel #2
0
    def test_skiprows_inference_empty(self):
        test = """
AA   BBB  C
12   345  6
78   901  2
""".strip()

        with pytest.raises(EmptyDataError):
            read_fwf(StringIO(test), skiprows=3)
Beispiel #3
0
def test_skiprows_inference_empty():
    """Skipping all three lines must raise EmptyDataError with the width message."""
    raw = """
AA   BBB  C
12   345  6
78   901  2
""".strip()
    buf = StringIO(raw)
    expected_message = "No rows from which to infer column width"

    with pytest.raises(EmptyDataError, match=expected_message):
        read_fwf(buf, skiprows=3)
Beispiel #4
0
def test_under_specified():
    """Passing ``None`` for both colspecs and widths must raise ValueError."""
    raw = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    buf = StringIO(raw)
    with pytest.raises(ValueError, match="Must specify either"):
        read_fwf(buf, colspecs=None, widths=None)
Beispiel #5
0
    def test_bool_header_arg(self):
        # see gh-6114
        data = """\
MyColumn
   a
   b
   a
   b"""
        for arg in [True, False]:
            with pytest.raises(TypeError):
                read_fwf(StringIO(data), header=arg)
Beispiel #6
0
def test_bool_header_arg(header):
    """The supplied ``header`` value must be rejected with TypeError (gh-6114)."""
    raw = """\
MyColumn
   a
   b
   a
   b"""
    buf = StringIO(raw)
    expected_message = "Passing a bool to header is invalid"

    with pytest.raises(TypeError, match=expected_message):
        read_fwf(buf, header=header)
Beispiel #7
0
def test_variable_width_unicode():
    """Inferred colspecs on UTF-8 encoded bytes must match the explicit ones."""
    text = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    codec = "utf8"
    payload = text.encode(codec)

    # Reference frame: the column boundaries are spelled out by hand.
    expected = read_fwf(
        BytesIO(payload), colspecs=[(0, 4), (5, 9)], header=None, encoding=codec
    )
    # Frame under test: the boundaries are left for read_fwf to infer.
    result = read_fwf(BytesIO(payload), header=None, encoding=codec)
    tm.assert_frame_equal(result, expected)
Beispiel #8
0
    def test_full_file(self):
        # File with all values
        test = '''index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00  1.04791624281    -4  bar
2000-01-05T00:00:00  0.498580885705   73  baz
2000-01-06T00:00:00  1.12020151869     1  foo
2000-01-07T00:00:00  0.487094399463    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
2000-01-11T00:00:00  0.157160753327   34  foo'''
        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
Beispiel #9
0
    def test_full_file_with_spaces(self):
        # File with spaces in columns
        test = """
Account                 Name  Balance     CreditLimit   AccountCreated
101     Keanu Reeves          9315.45     10000.00           1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868     Jennifer Love Hewitt  0           17000.00           5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65      5000.00             2/5/2007
""".strip('\r\n')
        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
Beispiel #10
0
    def test_full_file_with_spaces_and_missing(self):
        # File with spaces and missing values in columsn
        test = """
Account               Name    Balance     CreditLimit   AccountCreated
101                           10000.00                       1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868                                                          5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65
""".strip('\r\n')
        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
Beispiel #11
0
    def test_messed_up_data(self):
        # Completely messed up file
        test = """
   Account          Name             Balance     Credit Limit   Account Created
       101                           10000.00                       1/17/1998
       312     Gerard Butler         90.00       1000.00

       761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
  317          Bill Murray           789.65
""".strip('\r\n')
        colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
Beispiel #12
0
def test_over_specified():
    """Passing both colspecs and widths must raise ValueError."""
    raw = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    spans = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    field_widths = [6, 10, 10, 7]

    with pytest.raises(ValueError, match="must specify only one of"):
        read_fwf(StringIO(raw), colspecs=spans, widths=field_widths)
Beispiel #13
0
    def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
        data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

        with tm.assertRaisesRegexp(TypeError,
                                   'Each column specification must be.+'):
            read_fwf(StringIO(data), [('a', 1)])
Beispiel #14
0
    def test_full_file_with_missing(self):
        # File with missing values
        test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00  1.04791624281    -4  bar
                     0.498580885705   73  baz
2000-01-06T00:00:00  1.12020151869     1  foo
2000-01-07T00:00:00                    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
                                      34"""
        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
Beispiel #15
0
    def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
        data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

        msg = 'Each column specification must be.+'
        with pytest.raises(TypeError, match=msg):
            read_fwf(StringIO(data), [('a', 1)])
Beispiel #16
0
    def test_variable_width_unicode(self):
        if not compat.PY3:
            pytest.skip(
                'Bytes-related test - only needs to work on Python 3')
        test = """
שלום שלום
ום   שלל
של   ום
""".strip('\r\n')
        expected = read_fwf(BytesIO(test.encode('utf8')),
                            colspecs=[(0, 4), (5, 9)],
                            header=None, encoding='utf8')
        tm.assert_frame_equal(expected, read_fwf(
            BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
Beispiel #17
0
    def test_multiple_delimiters(self):
        test = r"""
col1~~~~~col2  col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
  33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01   baz~~Jennifer Love Hewitt
~~55       11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar           Bill Murray
""".strip('\r\n')
        colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
        expected = read_fwf(StringIO(test), colspecs=colspecs,
                            delimiter=' +~.\\')
        tm.assert_frame_equal(expected, read_fwf(StringIO(test),
                                                 delimiter=' +~.\\'))
Beispiel #18
0
def test_fwf_colspecs_is_list_or_tuple():
    """Passing a dict as colspecs must raise TypeError."""
    raw = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    bad_colspecs = {"a": 1}
    pattern = "column specifications must be a list or tuple.+"

    with pytest.raises(TypeError, match=pattern):
        read_fwf(StringIO(raw), colspecs=bad_colspecs, delimiter=",")
Beispiel #19
0
    def test_fwf_colspecs_infer_nrows(self):
        # GH 15138
        data = """\
  1  2
123 98
"""
        # infer_nrows == 1 should have colspec == [(2, 3), (5, 6)]
        df = read_fwf(StringIO(data), header=None, infer_nrows=1)
        expected = pd.DataFrame([[1, 2], [3, 8]])
        tm.assert_frame_equal(df, expected)

        # test for infer_nrows > number of rows
        df = read_fwf(StringIO(data), header=None, infer_nrows=10)
        expected = pd.DataFrame([[1, 2], [123, 98]])
        tm.assert_frame_equal(df, expected)
Beispiel #20
0
    def test_fwf_regression(self):
        # GH 3594
        # turns out 'T060' is parsable as a datetime slice!

        tzlist = [1, 10, 20, 30, 60, 80, 100]
        ntz = len(tzlist)
        tcolspecs = [16] + [8] * ntz
        tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]]
        data = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
  2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
  2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
  2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
  2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

        df = read_fwf(StringIO(data),
                      index_col=0,
                      header=None,
                      names=tcolnames,
                      widths=tcolspecs,
                      parse_dates=True,
                      date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S'))

        for c in df.columns:
            res = df.loc[:, c]
            assert len(res)
Beispiel #21
0
def test_non_space_filler():
    """Parse data padded with '~' by passing it as the ``delimiter``."""
    # Credit to Thomas Kluyver: fixed-width files sometimes use a filler
    # character other than a space, and read_fwf copes with that through the
    # 'delimiter' argument. Background:
    #
    # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
    raw = """\
A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
    spans = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(raw), colspecs=spans, delimiter="~")

    rows = [
        [2011, 58, 360.242940, 149.910199, 11950.7],
        [2011, 59, 444.953632, 166.985655, 11788.4],
        [2011, 60, 364.136849, 183.628767, 11806.2],
        [2011, 61, 413.836124, 184.375703, 11916.8],
        [2011, 62, 502.953953, 173.237159, 12468.3],
    ]
    expected = DataFrame(rows, columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)
Beispiel #22
0
def test_variable_width_unicode():
    """Inferred colspecs on UTF-8 bytes must match the explicit ones (Python 3 only)."""
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    text = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    codec = "utf8"
    payload = text.encode(codec)

    expected = read_fwf(
        BytesIO(payload), colspecs=[(0, 4), (5, 9)], header=None, encoding=codec
    )
    result = read_fwf(BytesIO(payload), header=None, encoding=codec)
    tm.assert_frame_equal(result, expected)
Beispiel #23
0
def test_fwf_regression():
    """Parse a width-specified table with a custom date parser on the index (gh-3594).

    The original report: a label such as "T060" is parsable as a datetime
    slice.
    """
    tz_values = [1, 10, 20, 30, 60, 80, 100]
    field_widths = [16] + [8] * len(tz_values)
    column_names = ["SST"] + ["T%03d" % z for z in tz_values[1:]]

    raw = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

    def parse_stamp(s):
        # Stamps are year + day-of-year + HHMMSS, e.g. 2009164202000.
        return datetime.strptime(s, "%Y%j%H%M%S")

    result = read_fwf(
        StringIO(raw),
        index_col=0,
        header=None,
        names=column_names,
        widths=field_widths,
        parse_dates=True,
        date_parser=parse_stamp,
    )

    rows = [
        [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
        [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
        [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
        [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
        [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
    ]
    stamps = DatetimeIndex([
        "2009-06-13 20:20:00",
        "2009-06-13 20:30:00",
        "2009-06-13 20:40:00",
        "2009-06-13 20:50:00",
        "2009-06-13 21:00:00",
    ])
    expected = DataFrame(
        rows,
        index=stamps,
        columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"],
    )
    tm.assert_frame_equal(result, expected)
def getdept(txt, datereq):
    """Extract the fixed-width table that follows the header block of ``txt``.

    The text is scanned line by line. The most recent ``dd/dd/dddd`` date seen
    is remembered, and scanning stops at the first line starting with ``* V``.
    Everything after that line is parsed with ``read_fwf``.

    Parameters
    ----------
    txt : str
        Full text of the report.
    datereq : object with ``strftime``
        Requested date; used (formatted as ``%m/%d/%Y``) when no date was
        found in the header before the ``* V`` line.

    Returns
    -------
    tuple
        ``(data, datestr)``: the parsed frame with columns
        ``['day', 'sked', 'act']`` and the date string.
    """
    # Raw strings: '\d' and '\*' are invalid escapes in a normal literal.
    firstheadpat = compile(r'\d{2}/\d{2}/\d{4}')  # not for zip files!
    lastheadpat = compile(r'^\* V')
    datestr = None
    with StringIO(txt) as inpt:
        for line in inpt:
            found = firstheadpat.search(line)
            if found:
                datestr = found.group(0)
            if lastheadpat.search(line):
                if datestr is None:
                    # must be a zip file where no dates are given
                    datestr = datereq.strftime('%m/%d/%Y')
                break
        # The stream is now positioned just past the '* V' line (or at EOF
        # if that line never appeared), so read_fwf only sees what follows.
        data = read_fwf(inpt,
                        colspecs=[(2, 5), (16, 17), (19, 24), (31, 36)],
                        index_col=0,
                        header=None,
                        skiprows=0)

    # last arrival (destination)
    arv = getarv(txt)
    # drop blank rows first; needed for trailing blank lines
    data = data.dropna(axis=0, how='all')
    # now that blank lines are gone, swap the '*' placeholder for nan
    data = data.replace('*', nan)

    # we know arrival is one line, the last line of the file
    # NOTE(review): `.ix` was removed in pandas 1.0. Porting to .iloc/.loc
    # may change label alignment, so confirm getarv's return shape first.
    data.ix[-1] = arv.ix[0]
    data.columns = ['day', 'sked', 'act']

    return data, datestr
Beispiel #25
0
def test_fwf_compression(compression):
    """A compressed fixed-width file must parse the same as the plain text."""
    text = """1111111111
    2222222222
    3333333333""".strip()

    read_kwargs = {"widths": [5, 5], "names": ["one", "two"]}
    expected = read_fwf(StringIO(text), **read_kwargs)

    # On Python 3 the payload handed to the compressor is UTF-8 bytes.
    payload = bytes(text, encoding="utf-8") if compat.PY3 else text

    with tm.ensure_clean() as path:
        tm.write_to_compressed(compression, path, payload)

        result = read_fwf(path, compression=compression, **read_kwargs)
        tm.assert_frame_equal(result, expected)
Beispiel #26
0
def test_bytes_io_input():
    """read_fwf must accept a BytesIO of UTF-8 encoded text (Python 3 only)."""
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    payload = "שלום\nשלום".encode('utf8')
    result = read_fwf(BytesIO(payload), widths=[2, 2], encoding="utf8")

    labels = ["של", "ום"]
    expected = DataFrame([["של", "ום"]], columns=labels)
    tm.assert_frame_equal(result, expected)
Beispiel #27
0
    def test_fwf(self):
        data_expected = """\
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
        expected = read_csv(StringIO(data_expected), header=None)

        data1 = """\
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
        colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
        df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
        assert_frame_equal(df, expected)

        data2 = """\
2011 58   360.242940   149.910199   11950.7
2011 59   444.953632   166.985655   11788.4
2011 60   364.136849   183.628767   11806.2
2011 61   413.836124   184.375703   11916.8
2011 62   502.953953   173.237159   12468.3
"""
        df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
        assert_frame_equal(df, expected)

        # From Thomas Kluyver: apparently some non-space filler characters can
        # be seen, this is supported by specifying the 'delimiter' character:
        # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
        data3 = """\
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
        df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
        assert_frame_equal(df, expected)

        self.assertRaises(ValueError, read_fwf, StringIO(data3),
                          colspecs=colspecs, widths=[6, 10, 10, 7])
Beispiel #28
0
    def test_BytesIO_input(self):
        """read_fwf must accept a BytesIO of UTF-8 encoded text (Python 3 only)."""
        if not compat.PY3:
            raise nose.SkipTest(
                "Bytes-related test - only needs to work on Python 3")

        payload = "שלום\nשלום".encode('utf8')
        result = read_fwf(BytesIO(payload), widths=[2, 2], encoding='utf8')

        labels = ["של", "ום"]
        expected = DataFrame([["של", "ום"]], columns=labels)
        tm.assert_frame_equal(result, expected)
Beispiel #29
0
def test_skiprows_passing_as_positional_deprecated():
    """A positional second argument must emit a "keyword-only" FutureWarning (GH#41485)."""
    raw = """0
1
2
"""
    buf = StringIO(raw)
    with tm.assert_produces_warning(FutureWarning, match="keyword-only"):
        result = read_fwf(buf, [(0, 2)])

    expected = DataFrame({"0": [1, 2]})
    tm.assert_frame_equal(result, expected)
Beispiel #30
0
    def test_1000_fwf(self):
        data = """
 1 2,334.0    5
10   13     10.
"""
        expected = np.array([[1, 2334., 5], [10, 13, 10]])
        df = read_fwf(StringIO(data),
                      colspecs=[(0, 3), (3, 11), (12, 16)],
                      thousands=',')
        tm.assert_almost_equal(df.values, expected)
Beispiel #31
0
    def test_1000_fwf(self):
        data = """
 1 2,334.0    5
10   13     10.
"""
        expected = [[1, 2334., 5],
                    [10, 13, 10]]
        df = read_fwf(StringIO(data), colspecs=[(0,3),(3,11),(12,16)],
                      thousands=',')
        assert_almost_equal(df.values, expected)
Beispiel #32
0
    def test_comment_fwf(self):
        data = """
  1   2.   4  #hello world
  5  NaN  10.0
"""
        expected = [[1, 2., 4],
                    [5, np.nan, 10.]]
        df = read_fwf(StringIO(data), colspecs=[(0,3),(4,9),(9,25)],
                      comment='#')
        assert_almost_equal(df.values, expected)
Beispiel #33
0
    def test_comment_fwf(self):
        data = """
  1   2.   4  #hello world
  5  NaN  10.0
"""
        expected = np.array([[1, 2., 4], [5, np.nan, 10.]])
        df = read_fwf(StringIO(data),
                      colspecs=[(0, 3), (4, 9), (9, 25)],
                      comment='#')
        tm.assert_almost_equal(df.values, expected)
Beispiel #34
0
def test_fwf_skip_blank_lines():
    """Blank lines are dropped with ``skip_blank_lines=True`` and kept as NaN rows otherwise."""
    labels = ["A", "B", "C", "D"]

    # Case 1: blank lines are scattered throughout and skipped.
    padded = """

A         B            C            D

201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4


201162    502.953953   173.237159   12468.3

"""
    result = read_fwf(StringIO(padded), skip_blank_lines=True)
    kept_rows = [
        [201158, 360.242940, 149.910199, 11950.7],
        [201159, 444.953632, 166.985655, 11788.4],
        [201162, 502.953953, 173.237159, 12468.3],
    ]
    expected = DataFrame(kept_rows, columns=labels)
    tm.assert_frame_equal(result, expected)

    # Case 2: the two blank lines inside the body become all-NaN rows.
    gapped = """\
A         B            C            D
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4


201162    502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(gapped), skip_blank_lines=False)
    empty_row = [np.nan, np.nan, np.nan, np.nan]
    rows_with_gaps = [
        [201158, 360.242940, 149.910199, 11950.7],
        [201159, 444.953632, 166.985655, 11788.4],
        list(empty_row),
        list(empty_row),
        [201162, 502.953953, 173.237159, 12468.3],
    ]
    expected = DataFrame(rows_with_gaps, columns=labels)
    tm.assert_frame_equal(result, expected)
def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
    # see gh-15138
    data = """\
  1  2
123 98
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None)
    tm.assert_frame_equal(result, expected)
def test_fwf_colspecs_none(colspecs, exp_data):
    # see gh-7079
    data = """\
123456
456789
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
    tm.assert_frame_equal(result, expected)
Beispiel #37
0
    def test_BytesIO_input(self):
        """read_fwf must accept a BytesIO of UTF-8 encoded text (Python 3 only)."""
        if not compat.PY3:
            raise nose.SkipTest(
                "Bytes-related test - only needs to work on Python 3")

        payload = "שלום\nשלום".encode('utf8')
        result = read_fwf(BytesIO(payload), widths=[2, 2], encoding='utf8')

        labels = ["של", "ום"]
        expected = DataFrame([["של", "ום"]], columns=labels)
        tm.assert_frame_equal(result, expected)
Beispiel #38
0
    def test_skiprows_inference(self):
        # GH11256
        test = """
Text contained in the file header

DataCol1   DataCol2
     0.0        1.0
   101.6      956.1
""".strip()
        expected = read_csv(StringIO(test), skiprows=2, delim_whitespace=True)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test), skiprows=2))
Beispiel #39
0
def test_names_and_infer_colspecs():
    """``names`` combined with ``usecols`` must work with inferred colspecs (GH#45337)."""
    raw = """X   Y   Z
      959.0    345   22.2
    """
    read_kwargs = {"skiprows": 1, "usecols": [0, 2], "names": ["a", "b"]}

    result = read_fwf(StringIO(raw), **read_kwargs)
    expected = DataFrame({"a": [959.0], "b": 22.2})
    tm.assert_frame_equal(result, expected)
Beispiel #40
0
def test_fwf_compression(compression_only, infer):
    """A compressed file must parse the same as plain text.

    ``infer`` selects how compression is passed: ``None`` omits the keyword,
    a truthy value passes ``"infer"``, a falsy one passes the codec name.
    """
    text = """1111111111
    2222222222
    3333333333""".strip()

    codec_name = compression_only
    # gzip files use the short "gz" suffix; other codecs use their own name.
    suffix = "gz" if codec_name == "gzip" else codec_name

    read_kwargs = {"widths": [5, 5], "names": ["one", "two"]}
    expected = read_fwf(StringIO(text), **read_kwargs)

    payload = bytes(text, encoding="utf-8")

    with tm.ensure_clean(filename="tmp." + suffix) as path:
        tm.write_to_compressed(codec_name, path, payload)

        if infer is not None:
            read_kwargs["compression"] = "infer" if infer else codec_name

        result = read_fwf(path, **read_kwargs)
        tm.assert_frame_equal(result, expected)
Beispiel #41
0
def test_default_delimiter():
    """With no ``delimiter`` given, space/tab padding must be stripped from fields."""
    comma_text = """
a,bbb
cc,dd"""

    fixed_text = """
a \tbbb
cc\tdd """
    result = read_fwf(
        StringIO(fixed_text), widths=[3, 3], header=None, skiprows=[0]
    )
    expected = read_csv(StringIO(comma_text), header=None)
    tm.assert_frame_equal(result, expected)
Beispiel #42
0
def test_fwf_comment(comment):
    data = """\
  1   2.   4  #hello world
  5  NaN  10.0
"""
    data = data.replace("#", comment)

    colspecs = [(0, 3), (4, 9), (9, 25)]
    expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]])

    result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment)
    tm.assert_almost_equal(result, expected)
Beispiel #43
0
def test_len_colspecs_len_names_with_index_col(colspecs, names, widths,
                                               index_col, expected):
    # GH#40830
    data = """col1  col2  col3  col4
    bab   ba    2"""
    result = read_fwf(
        StringIO(data),
        colspecs=colspecs,
        names=names,
        widths=widths,
        index_col=index_col,
    )
    tm.assert_frame_equal(result, expected)
Beispiel #44
0
    def test_fwf(self):
        data_expected = """\
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
        expected = read_csv(StringIO(data_expected),
                            engine='python',
                            header=None)

        data1 = """\
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
        colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
        df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
        tm.assert_frame_equal(df, expected)

        data2 = """\
2011 58   360.242940   149.910199   11950.7
2011 59   444.953632   166.985655   11788.4
2011 60   364.136849   183.628767   11806.2
2011 61   413.836124   184.375703   11916.8
2011 62   502.953953   173.237159   12468.3
"""
        df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
        tm.assert_frame_equal(df, expected)

        # From Thomas Kluyver: apparently some non-space filler characters can
        # be seen, this is supported by specifying the 'delimiter' character:
        # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
        data3 = """\
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
        df = read_fwf(StringIO(data3),
                      colspecs=colspecs,
                      delimiter='~',
                      header=None)
        tm.assert_frame_equal(df, expected)

        with tm.assertRaisesRegexp(ValueError, "must specify only one of"):
            read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7])

        with tm.assertRaisesRegexp(ValueError, "Must specify either"):
            read_fwf(StringIO(data3), colspecs=None, widths=None)
Beispiel #45
0
    def test_skiprows_by_index_inference(self):
        test = """
To be skipped
Not  To  Be  Skipped
Once more to be skipped
123  34   8      123
456  78   9      456
""".strip()

        expected = read_csv(StringIO(test), skiprows=[0, 2],
                            delim_whitespace=True)
        tm.assert_frame_equal(expected, read_fwf(
            StringIO(test), skiprows=[0, 2]))
Beispiel #46
0
def test_skip_rows_and_n_rows():
    """``nrows`` and a list-valued ``skiprows`` must combine correctly (GH#44021)."""
    raw = """a\tb
1\t a
2\t b
3\t c
4\t d
5\t e
6\t f
    """
    result = read_fwf(StringIO(raw), nrows=4, skiprows=[2, 4])

    columns = {"a": [1, 3, 5, 6], "b": ["a", "c", "e", "f"]}
    expected = DataFrame(columns)
    tm.assert_frame_equal(result, expected)
Beispiel #47
0
def test_skiprows_by_index_inference():
    """With rows 0 and 2 skipped, read_fwf must agree with whitespace-split read_csv."""
    text = """
To be skipped
Not  To  Be  Skipped
Once more to be skipped
123  34   8      123
456  78   9      456
""".strip()
    rows_to_skip = [0, 2]

    expected = read_csv(
        StringIO(text), skiprows=rows_to_skip, delim_whitespace=True
    )
    result = read_fwf(StringIO(text), skiprows=rows_to_skip)
    tm.assert_frame_equal(result, expected)
Beispiel #48
0
def test_skiprows_inference():
    """With two rows skipped, read_fwf must agree with whitespace-split read_csv (gh-11256)."""
    text = """
Text contained in the file header

DataCol1   DataCol2
     0.0        1.0
   101.6      956.1
""".strip()
    n_skipped = 2

    expected = read_csv(
        StringIO(text), skiprows=n_skipped, delim_whitespace=True
    )
    result = read_fwf(StringIO(text), skiprows=n_skipped)
    tm.assert_frame_equal(result, expected)
Beispiel #49
0
def test_fwf_thousands(thousands):
    data = """\
 1 2,334.0    5
10   13     10.
"""
    data = data.replace(",", thousands)

    colspecs = [(0, 3), (3, 11), (12, 16)]
    expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]])

    result = read_fwf(
        StringIO(data), header=None, colspecs=colspecs, thousands=thousands
    )
    tm.assert_almost_equal(result, expected)
Beispiel #50
0
    def test_fwf_compression(self):
        """gzip- and bz2-compressed files must parse the same as the plain text.

        The test is skipped when either compression module cannot be imported.
        """
        try:
            import gzip
            import bz2
        except ImportError:
            raise nose.SkipTest("Need gzip and bz2 to run this test")

        data = """1111111111
        2222222222
        3333333333""".strip()
        widths = [5, 5]
        names = ['one', 'two']
        expected = read_fwf(StringIO(data), widths=widths, names=names)
        if compat.PY3:
            # The compressors need bytes on Python 3.
            data = bytes(data, encoding='utf-8')
        comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
        for comp_name, compresser in comps:
            with tm.ensure_clean() as path:
                # A with-block closes (and flushes) the archive even when
                # write() raises, so no handle is leaked on failure.
                with compresser(path, mode='wb') as tmp:
                    tmp.write(data)
                result = read_fwf(path, widths=widths, names=names,
                                  compression=comp_name)
                tm.assert_frame_equal(result, expected)
Beispiel #51
0
def test_whitespace_preservation():
    """Spaces inside fields must survive when the delimiter is ``"\\n\\t"`` (gh-16772)."""
    comma_text = """
 a ,bbb
 cc,dd """

    fixed_text = """
 a bbb
 ccdd """
    read_kwargs = {
        "widths": [3, 3],
        "header": None,
        "skiprows": [0],
        "delimiter": "\n\t",
    }
    result = read_fwf(StringIO(fixed_text), **read_kwargs)
    expected = read_csv(StringIO(comma_text), header=None)
    tm.assert_frame_equal(result, expected)
Beispiel #52
0
    def test_default_delimiter(self):
        data_expected = """
a,bbb
cc,dd"""
        expected = read_csv(StringIO(data_expected), header=None)

        test_data = """
a \tbbb
cc\tdd """
        result = read_fwf(StringIO(test_data),
                          widths=[3, 3],
                          header=None,
                          skiprows=[0])

        tm.assert_frame_equal(result, expected)
Beispiel #53
0
def test_dtype(dtype):
    data = """ a    b    c
1    2    3.2
3    4    5.2
"""
    colspecs = [(0, 5), (5, 10), (10, None)]
    result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)

    expected = pd.DataFrame(
        {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"]
    )

    for col, dt in dtype.items():
        expected[col] = expected[col].astype(dt)

    tm.assert_frame_equal(result, expected)
Beispiel #54
0
def test_basic():
    """Read an aligned table with ``read_fwf`` using no colspecs or widths."""
    text = """\
A         B            C            D
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    # Expected values, one inner list per data line of the text above.
    column_names = ["A", "B", "C", "D"]
    rows = [
        [201158, 360.242940, 149.910199, 11950.7],
        [201159, 444.953632, 166.985655, 11788.4],
        [201160, 364.136849, 183.628767, 11806.2],
        [201161, 413.836124, 184.375703, 11916.8],
        [201162, 502.953953, 173.237159, 12468.3],
    ]
    expected = DataFrame(rows, columns=column_names)

    result = read_fwf(StringIO(text))

    tm.assert_frame_equal(result, expected)
Beispiel #55
0
    def test_whitespace_preservation(self):
        # Addresses Issue #16772
        data_expected = """
 a ,bbb
 cc,dd """
        expected = read_csv(StringIO(data_expected), header=None)

        test_data = """
 a bbb
 ccdd """
        result = read_fwf(StringIO(test_data),
                          widths=[3, 3],
                          header=None,
                          skiprows=[0],
                          delimiter="\n\t")

        tm.assert_frame_equal(result, expected)
Beispiel #56
0
def test_binary_mode():
    """
    read_fwf accepts a file object that was opened in binary mode.

    GH 18035.
    """
    text = """aas aas aas
bba bab b a"""

    with tm.ensure_clean() as path:
        # Write the text to a temporary file, then reopen it as raw bytes.
        Path(path).write_text(text)
        with open(path, "rb") as handle:
            parsed = read_fwf(handle)
            handle.seek(0)

            # The first line supplies the header; the frame below expects the
            # repeated "aas" label to come back with ".1" / ".2" suffixes.
            expected = DataFrame(
                [["bba", "bab", "b a"]],
                columns=["aas", "aas.1", "aas.2"],
                index=[0],
            )
            tm.assert_frame_equal(parsed, expected)
Beispiel #57
0
def test_encoding_mmap(memory_map):
    """
    encoding should be working, even when using a memory-mapped file.

    GH 23254.
    """
    encoding = "iso8859_1"
    with tm.ensure_clean() as path:
        Path(path).write_bytes(" 1 A Ä 2\n".encode(encoding))
        df = read_fwf(
            path,
            header=None,
            widths=[2, 2, 2, 2],
            encoding=encoding,
            memory_map=memory_map,
        )
    df_reference = DataFrame([[1, "A", "Ä", 2]])
    tm.assert_frame_equal(df, df_reference)
Beispiel #58
0
def test_encoding_mmap(memory_map):
    """
    encoding should be working, even when using a memory-mapped file.

    GH 23254.
    """
    encoding = "iso8859_1"
    data = BytesIO(" 1 A Ä 2\n".encode(encoding))
    df = read_fwf(
        data,
        header=None,
        widths=[2, 2, 2, 2],
        encoding=encoding,
        memory_map=memory_map,
    )
    data.seek(0)
    df_reference = DataFrame([[1, "A", "Ä", 2]])
    tm.assert_frame_equal(df, df_reference)
Beispiel #59
0
def test_fwf_regression():
    """Parse a fixed-width table whose first column is a day-of-year timestamp.

    See gh-3594.

    Turns out "T060" is parsable as a datetime slice!
    """
    tz_list = [1, 10, 20, 30, 60, 80, 100]

    # One 16-character timestamp column followed by an 8-character column per
    # entry of ``tz_list``.
    column_widths = [16]
    column_widths.extend([8] * len(tz_list))

    # The first label is fixed; the rest are zero-padded from ``tz_list[1:]``.
    column_names = ["SST"]
    column_names.extend(f"T{z:03d}" for z in tz_list[1:])

    text = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

    def parse_stamp(s):
        # Year, day of year, then hour/minute/second, all run together.
        return datetime.strptime(s, "%Y%j%H%M%S")

    result = read_fwf(
        StringIO(text),
        index_col=0,
        header=None,
        names=column_names,
        widths=column_widths,
        parse_dates=True,
        date_parser=parse_stamp,
    )

    expected_index = DatetimeIndex(
        [
            "2009-06-13 20:20:00",
            "2009-06-13 20:30:00",
            "2009-06-13 20:40:00",
            "2009-06-13 20:50:00",
            "2009-06-13 21:00:00",
        ]
    )
    expected_values = [
        [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
        [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
        [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
        [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
        [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
    ]
    expected = DataFrame(
        expected_values,
        index=expected_index,
        columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"],
    )
    tm.assert_frame_equal(result, expected)
Beispiel #60
0
def test_widths_and_usecols():
    """Combine ``widths`` for five fields with ``usecols`` selecting three.

    GH#46580
    """
    text = """0  1    n -0.4100.1
0  2    p  0.2 90.1
0  3    n -0.3140.4"""

    # ``names`` lists only the three columns picked by ``usecols``.
    reader_options = {
        "header": None,
        "usecols": (0, 1, 3),
        "widths": (3, 5, 1, 5, 5),
        "index_col": False,
        "names": ("c0", "c1", "c3"),
    }
    result = read_fwf(StringIO(text), **reader_options)

    expected_columns = {
        "c0": 0,
        "c1": [1, 2, 3],
        "c3": [-0.4, 0.2, -0.3],
    }
    expected = DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)