def test_determine_dtype(self):
    """Check the column dtypes of the parsed table for csv and txt input."""
    # Dataframe should be string
    csv_result = parse_bytesio(io.BytesIO(b'A\nB'), 'text/csv', None)
    csv_dtypes = csv_result.dataframe.dtypes
    self.assertTrue(all(csv_dtypes == object))

    # Dataframe should be category
    txt_stream = io.BytesIO(b'A;B;C\nD;E;F')
    txt_result = parse_bytesio(txt_stream, 'text/txt', None)
    txt_dtypes = txt_result.dataframe.dtypes
    self.assertTrue(all(txt_dtypes == 'category'))
def test_txt_separator_detection(self):
    """Expect the same table from txt input split by ';', tab or ','."""
    expected = ProcessResult(pandas.DataFrame({'A': ['B'], 'C': ['D']}))
    # Same two-column table, written once per separator under test.
    payloads = (b'A;C\nB;D', b'A\tC\nB\tD', b'A,C\nB,D')
    for payload in payloads:
        result = parse_bytesio(io.BytesIO(payload), 'text/txt', 'utf-8')
        self.assertEqual(result, expected)
def setUp(self):
    """Load the 'uploadfile' module and parse the mock CSV/XLSX fixtures.

    Stores the module, an APIRequestFactory and both parsed tables on the
    test instance for use by the individual tests.
    """
    super(UploadFileViewTests, self).setUp()  # log in
    self.wfm = load_and_add_module('uploadfile')
    self.factory = APIRequestFactory()

    # Path through chardet encoding detection
    with open(mock_csv_path, 'rb') as csv_file:
        csv_result = parse_bytesio(csv_file, 'text/csv', None)
    self.csv_table = csv_result.dataframe

    xlsx_mime = (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
    with open(mock_xlsx_path, 'rb') as xlsx_file:
        xlsx_result = parse_bytesio(xlsx_file, xlsx_mime, None)
    self.xlsx_table = xlsx_result.dataframe
    sanitize_dataframe(self.xlsx_table)
def test_excel(self):
    """Parse the example.xls fixture and compare it with the expected table."""
    # The fixture lives in ../test_data relative to this test module.
    xls_path = os.path.join(
        os.path.dirname(__file__), '..', 'test_data', 'example.xls'
    )
    with open(xls_path, 'rb') as file:
        result = parse_bytesio(file, 'application/vnd.ms-excel', None)

    table = pd.DataFrame({'foo': [1, 2], 'bar': [2, 3]})
    self.assertEqual(result, ProcessResult(table))
def test_autodetect_charset_chunked(self):
    """Expect 'café' from a 0xE9-encoded CSV parsed with no encoding given."""
    # NOTE(review): the input here is the same as in the ISO-8859-1
    # autodetect test; nothing visible forces a chunked read -- confirm.
    raw = io.BytesIO(b'A\ncaf\xe9')
    result = parse_bytesio(raw, 'text/csv', None)

    frame = pandas.DataFrame({'A': ['café']})
    expected = ProcessResult(frame.astype('category'))
    self.assertEqual(result, expected)
def test_array_becomes_str(self):
    """Expect a JSON array value to come back as its string representation."""
    payload = """[ {"A": ["foo", "bar"]} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'A': ["['foo', 'bar']"]})
    assert_frame_equal(result.dataframe, expected)
def test_object_becomes_str(self):
    """Expect a nested JSON object to come back as its string representation."""
    payload = """[ {"A": {"foo":"bar"}} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'A': ["{'foo': 'bar'}"]})
    assert_frame_equal(result.dataframe, expected)
def test_json_not_records(self):
    """Expect an error result when the JSON top level is an object."""
    payload = b'{"meta":{"foo":"bar"},"data":[]}'
    result = parse_bytesio(io.BytesIO(payload), "application/json")

    message = (
        "Workbench cannot import this JSON file. The JSON file must "
        "be an Array of Objects for Workbench to import it."
    )
    self.assertEqual(result, ProcessResult(error=message))
def test_csv_no_na_filter(self):
    """Expect the literal text 'NA' to be kept as a string, not `np.nan`."""
    raw = io.BytesIO(b"A;C\nB;NA")
    result = parse_bytesio(raw, "text/csv", "utf-8")

    table = pd.DataFrame({"A": ["B"], "C": ["NA"]})
    self.assertEqual(result, ProcessResult(table))
def test_parse_utf8_csv(self):
    """Expect 'café' from a UTF-8 encoded CSV parsed as 'utf-8'."""
    # b'\xc3\xa9' is the two-byte UTF-8 encoding of 'é'.
    raw = io.BytesIO(b'A\ncaf\xc3\xa9')
    result = parse_bytesio(raw, 'text/csv', 'utf-8')

    frame = pandas.DataFrame({'A': ['café']})
    expected = ProcessResult(frame.astype('category'))
    self.assertEqual(result, expected)
def test_csv_no_na_filter(self):
    """Expect the cell text 'NA' to survive parsing instead of `np.nan`."""
    source = io.BytesIO(b'A;C\nB;NA')
    expected_frame = pd.DataFrame({'A': ['B'], 'C': ['NA']})

    actual = parse_bytesio(source, 'text/csv', 'utf-8')
    self.assertEqual(actual, ProcessResult(expected_frame))
def test_autodetect_charset_iso8859_1(self):
    """Expect 'café' from ISO-8859-1 bytes when no encoding is supplied."""
    # \xe9 is ISO-8859-1 so Workbench should auto-detect it
    raw = io.BytesIO(b'A\ncaf\xe9')
    result = parse_bytesio(raw, 'text/csv', None)

    frame = pandas.DataFrame({'A': ['café']})
    expected = ProcessResult(frame.astype('category'))
    self.assertEqual(result, expected)
def test_json_int64(self):
    """Expect a large integer (e.g. a Twitter ID) to keep its exact value."""
    payload = """[ {"A": 1093943422262697985} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'A': [1093943422262697985]})
    assert_frame_equal(result.dataframe, expected)
def test_json_with_int_nulls(self):
    """Expect an integer column containing null to become float with NaN."""
    payload = """[ {"A": 1}, {"A": null} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'A': [1.0, np.nan]})
    assert_frame_equal(result.dataframe, expected)
def test_autodetect_charset_windows_1252(self):
    """Expect an en dash from windows-1252 bytes when no encoding is given."""
    # \x96 is an en dash in windows-1252, does not exist in UTF-8 or
    # ISO-8859-1
    raw = io.BytesIO(b'A\n2000\x962018')
    result = parse_bytesio(raw, 'text/csv', None)

    frame = pandas.DataFrame({'A': ['2000–2018']})
    expected = ProcessResult(frame.astype('category'))
    self.assertEqual(result, expected)
def test_json_with_nulls(self):
    """Compare a string column containing a JSON null with a str-dtype frame."""
    payload = """[ {"A": "a"}, {"A": null} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'A': ['a', None]}, dtype=str)
    assert_frame_equal(result.dataframe, expected)
def test_replace_invalid_utf8(self):
    """Expect a byte that is invalid in UTF-8 to come back as U+FFFD."""
    # \xe9 is ISO-8859-1 and we select 'utf-8' to test Workbench's recovery
    raw = io.BytesIO(b'A\ncaf\xe9')
    result = parse_bytesio(raw, 'text/csv', 'utf-8')

    frame = pandas.DataFrame({'A': ['caf�']})
    expected = ProcessResult(frame.astype('category'))
    self.assertEqual(result, expected)
def test_autodetect_charset_iso8859_1(self):
    """Expect 'café' when ISO-8859-1 bytes are parsed without an encoding."""
    # \xe9 is ISO-8859-1 so Workbench should auto-detect it
    source = io.BytesIO(b"A\ncaf\xe9")
    expected_frame = pd.DataFrame({"A": ["café"]}).astype("category")

    actual = parse_bytesio(source, "text/csv", None)
    self.assertEqual(actual, ProcessResult(expected_frame))
def test_json_with_undefined(self):
    """Expect a key missing from one record to show up as NaN in that row."""
    payload = """[ {"A": "a"}, {"A": "aa", "B": "b"} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'A': ['a', 'aa'], 'B': [np.nan, 'b']})
    assert_frame_equal(result.dataframe, expected)
def test_replace_invalid_utf8(self):
    """Expect U+FFFD in place of a byte that cannot be decoded as UTF-8."""
    # \xe9 is ISO-8859-1 and we select 'utf-8' to test Workbench's recovery
    source = io.BytesIO(b"A\ncaf\xe9")
    expected_frame = pd.DataFrame({"A": ["caf�"]}).astype("category")

    actual = parse_bytesio(source, "text/csv", "utf-8")
    self.assertEqual(actual, ProcessResult(expected_frame))
def test_parse_utf8_csv(self):
    """Expect 'café' when UTF-8 bytes are parsed with encoding 'utf-8'."""
    # b"\xc3\xa9" is the two-byte UTF-8 encoding of 'é'.
    source = io.BytesIO(b"A\ncaf\xc3\xa9")
    expected_frame = pd.DataFrame({"A": ["café"]}).astype("category")

    actual = parse_bytesio(source, "text/csv", "utf-8")
    self.assertEqual(actual, ProcessResult(expected_frame))
def test_autodetect_charset_utf8(self):
    """Expect multi-byte UTF-8 text to decode when no encoding is supplied."""
    # The nine bytes after the header are the UTF-8 encoding of "谢谢你".
    source = io.BytesIO(b"A\n\xE8\xB0\xA2\xE8\xB0\xA2\xE4\xBD\xA0")
    expected_frame = pd.DataFrame({"A": ["谢谢你"]}).astype("category")

    actual = parse_bytesio(source, "text/csv", None)
    self.assertEqual(actual, ProcessResult(expected_frame))
def test_autodetect_charset_windows_1252(self):
    """Expect an en dash when windows-1252 bytes are parsed without encoding."""
    # \x96 is an en dash in windows-1252, does not exist in UTF-8 or
    # ISO-8859-1
    source = io.BytesIO(b"A\n2000\x962018")
    expected_frame = pd.DataFrame({"A": ["2000–2018"]}).astype("category")

    actual = parse_bytesio(source, "text/csv", None)
    self.assertEqual(actual, ProcessResult(expected_frame))
def test_json_str_numbers_are_str(self):
    """JSON input data specifies whether a value is a String or a Number."""
    payload = """[ {"A": "1"}, {"A": "2"} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    # Quoted digits are expected to stay strings.
    expected = pd.DataFrame({'A': ['1', '2']})
    assert_frame_equal(result.dataframe, expected)
def test_json_mixed_types_are_str(self):
    """Expect a column mixing JSON numbers and strings to come back as str."""
    payload = """[ {"A": 1}, {"A": "2"} ]""".encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'A': ['1', '2']})
    assert_frame_equal(result.dataframe, expected)
def test_json_str_dates_are_str(self):
    """JSON has no date type, so date-like strings are expected to stay str."""
    payload = (
        """[ {"date": "2019-02-20"}, {"date": "2019-02-21"} ]"""
    ).encode('utf-8')
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = pd.DataFrame({'date': ['2019-02-20', '2019-02-21']})
    assert_frame_equal(result.dataframe, expected)
def test_autodetect_charset_utf8(self):
    """Expect multi-byte UTF-8 text to decode when no encoding is given."""
    # The nine bytes after the header are the UTF-8 encoding of '谢谢你'.
    raw = io.BytesIO(b'A\n\xE8\xB0\xA2\xE8\xB0\xA2\xE4\xBD\xA0')
    result = parse_bytesio(raw, 'text/csv', None)

    frame = pandas.DataFrame({'A': ['谢谢你']})
    expected = ProcessResult(frame.astype('category'))
    self.assertEqual(result, expected)
def test_json_not_array(self):
    """Workbench requires Array of Object; a bare object yields an error."""
    payload = b'{"last_updated":"02/21/2019"}'
    result = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected = ProcessResult(error=(
        'Workbench cannot import this JSON file. The JSON file '
        'must be an Array of Objects for Workbench to import it.'
    ))
    self.assertEqual(result, expected)
def test_json_with_nulls(self):
    """Compare a string column holding a JSON null with a str-dtype result."""
    payload = """[ {"A": "a"}, {"A": null} ]""".encode('utf-8')
    actual = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected_frame = pandas.DataFrame({'A': ['a', None]}, dtype=str)
    self.assertEqual(actual, ProcessResult(expected_frame))
def test_json_with_undefined(self):
    """Expect a key absent from one record to be NaN in a str-dtype frame."""
    payload = """[ {"A": "a"}, {"A": "aa", "B": "b"} ]""".encode('utf-8')
    actual = parse_bytesio(io.BytesIO(payload), 'application/json')

    expected_frame = pd.DataFrame(
        {'A': ['a', 'aa'], 'B': [numpy.nan, 'b']},
        dtype=str,
    )
    self.assertEqual(actual, ProcessResult(expected_frame))