Beispiel #1
0
    def test_determine_dtype(self):
        """Check the column dtypes produced for a CSV and a TXT upload.

        Both inputs are parsed with the encoding argument set to ``None``.
        """
        # The 'text/csv' upload is expected to yield object (string) columns
        csv_result = parse_bytesio(io.BytesIO(b'A\nB'), 'text/csv', None)
        csv_dtypes = csv_result.dataframe.dtypes
        self.assertTrue(all(csv_dtypes == object))

        # The 'text/txt' upload is expected to yield category columns
        txt_stream = io.BytesIO(b'A;B;C\nD;E;F')
        txt_result = parse_bytesio(txt_stream, 'text/txt', None)
        txt_dtypes = txt_result.dataframe.dtypes
        self.assertTrue(all(txt_dtypes == 'category'))
Beispiel #2
0
    def test_txt_separator_detection(self):
        """Parse one table written with ';', tab and ',' as the separator.

        Every variant is submitted as 'text/txt' with 'utf-8' encoding and
        must produce the same two-column, one-row result.
        """
        expected = ProcessResult(pandas.DataFrame({'A': ['B'], 'C': ['D']}))
        variants = (b'A;C\nB;D', b'A\tC\nB\tD', b'A,C\nB,D')

        for raw in variants:
            # subTest reports each separator on its own instead of aborting
            # the whole test at the first variant that fails.
            with self.subTest(raw=raw):
                result = parse_bytesio(io.BytesIO(raw), 'text/txt', 'utf-8')
                self.assertEqual(result, expected)
    def setUp(self):
        """Prepare the 'uploadfile' module, a request factory and fixtures.

        The mock CSV and XLSX files are parsed once and kept on the test
        instance as ``csv_table`` and ``xlsx_table``.
        """
        super(UploadFileViewTests, self).setUp()  # log in
        self.wfm = load_and_add_module('uploadfile')
        self.factory = APIRequestFactory()

        xlsx_mime = (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )

        # Encoding is None, so this takes the chardet encoding detection path
        with open(mock_csv_path, 'rb') as csv_file:
            csv_result = parse_bytesio(csv_file, 'text/csv', None)
            self.csv_table = csv_result.dataframe

        with open(mock_xlsx_path, 'rb') as xlsx_file:
            xlsx_result = parse_bytesio(xlsx_file, xlsx_mime, None)
            self.xlsx_table = xlsx_result.dataframe
            sanitize_dataframe(self.xlsx_table)
Beispiel #4
0
 def test_excel(self):
     """Parse the example.xls fixture and compare it with the known table."""
     fixture_path = os.path.join(
         os.path.dirname(__file__), '..', 'test_data', 'example.xls')
     with open(fixture_path, 'rb') as xls_file:
         result = parse_bytesio(xls_file, 'application/vnd.ms-excel', None)

     expected_frame = pd.DataFrame({'foo': [1, 2], 'bar': [2, 3]})
     self.assertEqual(result, ProcessResult(expected_frame))
Beispiel #5
0
 def test_autodetect_charset_chunked(self):
     """Parse a CSV with no encoding given and expect 'café' back.

     NOTE(review): nothing in this body is visibly chunk-specific;
     presumably any chunking happens inside parse_bytesio -- confirm.
     """
     stream = io.BytesIO(b'A\ncaf\xe9')
     expected_frame = pandas.DataFrame({'A': ['café']}).astype('category')

     result = parse_bytesio(stream, 'text/csv', None)

     self.assertEqual(result, ProcessResult(expected_frame))
Beispiel #6
0
 def test_array_becomes_str(self):
     """A JSON array nested in a record is expected back as one string."""
     payload = """[
         {"A": ["foo", "bar"]}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     assert_frame_equal(result.dataframe,
                        pd.DataFrame({'A': ["['foo', 'bar']"]}))
Beispiel #7
0
 def test_object_becomes_str(self):
     """A JSON object nested in a record is expected back as one string."""
     payload = """[
         {"A": {"foo":"bar"}}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     assert_frame_equal(result.dataframe,
                        pd.DataFrame({'A': ["{'foo': 'bar'}"]}))
Beispiel #8
0
 def test_json_not_records(self):
     """A top-level JSON object (not an array) must give the import error."""
     message = (
         "Workbench cannot import this JSON file. The JSON file must "
         "be an Array of Objects for Workbench to import it.")
     stream = io.BytesIO(b'{"meta":{"foo":"bar"},"data":[]}')

     result = parse_bytesio(stream, "application/json")

     self.assertEqual(result, ProcessResult(error=message))
Beispiel #9
0
 def test_csv_no_na_filter(self):
     """
     The cell text 'NA' must come back as a string, not as `np.nan`
     """
     stream = io.BytesIO(b"A;C\nB;NA")
     expected_frame = pd.DataFrame({"A": ["B"], "C": ["NA"]})

     result = parse_bytesio(stream, "text/csv", "utf-8")

     self.assertEqual(result, ProcessResult(expected_frame))
Beispiel #10
0
 def test_parse_utf8_csv(self):
     """UTF-8 bytes parsed as 'utf-8' must decode to 'café'."""
     stream = io.BytesIO(b'A\ncaf\xc3\xa9')
     frame = pandas.DataFrame({'A': ['café']})
     expected = ProcessResult(frame.astype('category'))

     result = parse_bytesio(stream, 'text/csv', 'utf-8')

     self.assertEqual(result, expected)
Beispiel #11
0
 def test_csv_no_na_filter(self):
     """
     Parsing must keep the text 'NA' as-is rather than yield `np.nan`
     """
     raw = b'A;C\nB;NA'
     wanted = ProcessResult(pd.DataFrame({'A': ['B'], 'C': ['NA']}))

     actual = parse_bytesio(io.BytesIO(raw), 'text/csv', 'utf-8')

     self.assertEqual(actual, wanted)
Beispiel #12
0
 def test_autodetect_charset_iso8859_1(self):
     """With no encoding given, ISO-8859-1 input must decode to 'café'."""
     # \xe9 is 'é' in ISO-8859-1, so Workbench should auto-detect it
     stream = io.BytesIO(b'A\ncaf\xe9')
     frame = pandas.DataFrame({'A': ['café']})

     result = parse_bytesio(stream, 'text/csv', None)

     self.assertEqual(result, ProcessResult(frame.astype('category')))
Beispiel #13
0
 def test_json_int64(self):
     """A large integer such as a Twitter ID must survive as an int64."""
     payload = """[
         {"A": 1093943422262697985}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     assert_frame_equal(result.dataframe,
                        pd.DataFrame({'A': [1093943422262697985]}))
Beispiel #14
0
 def test_json_with_int_nulls(self):
     """A column mixing an integer and null is expected as [1.0, NaN]."""
     payload = """[
         {"A": 1},
         {"A": null}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     assert_frame_equal(result.dataframe,
                        pd.DataFrame({'A': [1.0, np.nan]}))
Beispiel #15
0
 def test_autodetect_charset_windows_1252(self):
     """With no encoding given, windows-1252 input must be auto-detected."""
     # \x96 is - in windows-1252, does not exist in UTF-8 or ISO-8859-1
     stream = io.BytesIO(b'A\n2000\x962018')
     frame = pandas.DataFrame({'A': ['2000–2018']})

     result = parse_bytesio(stream, 'text/csv', None)

     self.assertEqual(result, ProcessResult(frame.astype('category')))
Beispiel #16
0
 def test_json_with_nulls(self):
     """A JSON null in a string column is compared against None."""
     payload = """[
         {"A": "a"},
         {"A": null}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     assert_frame_equal(result.dataframe,
                        pd.DataFrame({'A': ['a', None]}, dtype=str))
Beispiel #17
0
 def test_replace_invalid_utf8(self):
     """Bytes that are invalid UTF-8 must come back as a replacement char."""
     # \xe9 is ISO-8859-1; forcing 'utf-8' exercises Workbench's recovery
     stream = io.BytesIO(b'A\ncaf\xe9')
     frame = pandas.DataFrame({'A': ['caf�']})

     result = parse_bytesio(stream, 'text/csv', 'utf-8')

     self.assertEqual(result, ProcessResult(frame.astype('category')))
Beispiel #18
0
 def test_autodetect_charset_iso8859_1(self):
     """ISO-8859-1 bytes parsed without an encoding must give 'café'."""
     # \xe9 is 'é' in ISO-8859-1, so Workbench should auto-detect it
     raw = b"A\ncaf\xe9"
     wanted_frame = pd.DataFrame({"A": ["café"]}).astype("category")

     actual = parse_bytesio(io.BytesIO(raw), "text/csv", None)

     self.assertEqual(actual, ProcessResult(wanted_frame))
Beispiel #19
0
 def test_json_with_undefined(self):
     """A key missing from one record is expected as NaN in that row."""
     payload = """[
         {"A": "a"},
         {"A": "aa", "B": "b"}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     wanted = pd.DataFrame({'A': ['a', 'aa'], 'B': [np.nan, 'b']})
     assert_frame_equal(result.dataframe, wanted)
Beispiel #20
0
 def test_replace_invalid_utf8(self):
     """Invalid UTF-8 input must be parsed with a replacement character."""
     # \xe9 is ISO-8859-1; selecting 'utf-8' tests Workbench's recovery
     raw = b"A\ncaf\xe9"
     wanted_frame = pd.DataFrame({"A": ["caf�"]}).astype("category")

     actual = parse_bytesio(io.BytesIO(raw), "text/csv", "utf-8")

     self.assertEqual(actual, ProcessResult(wanted_frame))
Beispiel #21
0
 def test_parse_utf8_csv(self):
     """A UTF-8 encoded CSV parsed as 'utf-8' must yield 'café'."""
     raw = b"A\ncaf\xc3\xa9"
     wanted_frame = pd.DataFrame({"A": ["café"]}).astype("category")

     actual = parse_bytesio(io.BytesIO(raw), "text/csv", "utf-8")

     self.assertEqual(actual, ProcessResult(wanted_frame))
Beispiel #22
0
 def test_autodetect_charset_utf8(self):
     """Multi-byte UTF-8 text parsed without an encoding must decode."""
     raw = b"A\n\xE8\xB0\xA2\xE8\xB0\xA2\xE4\xBD\xA0"
     wanted_frame = pd.DataFrame({"A": ["谢谢你"]}).astype("category")

     actual = parse_bytesio(io.BytesIO(raw), "text/csv", None)

     self.assertEqual(actual, ProcessResult(wanted_frame))
Beispiel #23
0
 def test_autodetect_charset_windows_1252(self):
     """windows-1252 bytes parsed without an encoding must be detected."""
     # \x96 is - in windows-1252, does not exist in UTF-8 or ISO-8859-1
     raw = b"A\n2000\x962018"
     wanted_frame = pd.DataFrame({"A": ["2000–2018"]}).astype("category")

     actual = parse_bytesio(io.BytesIO(raw), "text/csv", None)

     self.assertEqual(actual, ProcessResult(wanted_frame))
Beispiel #24
0
 def test_json_str_numbers_are_str(self):
     """JSON values written as strings stay strings, even if numeric."""
     payload = """[
         {"A": "1"},
         {"A": "2"}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     assert_frame_equal(result.dataframe, pd.DataFrame({'A': ['1', '2']}))
Beispiel #25
0
 def test_json_mixed_types_are_str(self):
     """A column mixing a JSON number and a JSON string becomes all-str.

     The input holds the number 1 and the string "2" under the same key;
     the parsed column is expected to be ['1', '2'].
     """
     payload = """[
         {"A": 1},
         {"A": "2"}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     expected = pd.DataFrame({'A': ['1', '2']})
     assert_frame_equal(result.dataframe, expected)
Beispiel #26
0
 def test_json_str_dates_are_str(self):
     """Date-looking JSON strings stay strings; JSON has no date type."""
     payload = """[
         {"date": "2019-02-20"},
         {"date": "2019-02-21"}
     ]""".encode('utf-8')

     result = parse_bytesio(io.BytesIO(payload), 'application/json')

     wanted = pd.DataFrame({'date': ['2019-02-20', '2019-02-21']})
     assert_frame_equal(result.dataframe, wanted)
Beispiel #27
0
 def test_autodetect_charset_utf8(self):
     """With no encoding given, multi-byte UTF-8 input must decode."""
     stream = io.BytesIO(b'A\n\xE8\xB0\xA2\xE8\xB0\xA2\xE4\xBD\xA0')
     frame = pandas.DataFrame({'A': ['谢谢你']})

     result = parse_bytesio(stream, 'text/csv', None)

     self.assertEqual(result, ProcessResult(frame.astype('category')))
Beispiel #28
0
 def test_json_not_array(self):
     """A top-level JSON object is rejected: an Array of Objects is needed."""
     message = (
         'Workbench cannot import this JSON file. The JSON file '
         'must be an Array of Objects for Workbench to import it.')
     stream = io.BytesIO(b'{"last_updated":"02/21/2019"}')

     result = parse_bytesio(stream, 'application/json')

     self.assertEqual(result, ProcessResult(error=message))
Beispiel #29
0
 def test_json_with_nulls(self):
     """A JSON null in a string column is compared against None."""
     payload = """[
         {"A": "a"},
         {"A": null}
     ]""".encode('utf-8')
     wanted_frame = pandas.DataFrame({'A': ['a', None]}, dtype=str)

     actual = parse_bytesio(io.BytesIO(payload), 'application/json')

     self.assertEqual(actual, ProcessResult(wanted_frame))
Beispiel #30
0
 def test_json_with_undefined(self):
     """A key missing from one record is compared against NaN in that row."""
     payload = """[
         {"A": "a"},
         {"A": "aa", "B": "b"}
     ]""".encode('utf-8')
     columns = {'A': ['a', 'aa'], 'B': [numpy.nan, 'b']}
     wanted_frame = pd.DataFrame(columns, dtype=str)

     actual = parse_bytesio(io.BytesIO(payload), 'application/json')

     self.assertEqual(actual, ProcessResult(wanted_frame))