Code example #1
0
def _parse_tsv(bytesio: io.BytesIO, text_encoding: _TextEncoding) -> DataFrame:
    """Parse tab-separated bytes into a DataFrame, or raise a parse error.

    Peculiarities:

    * The file encoding defaults to UTF-8.
    * Data types: every value arrives as a string (this is a CSV variant),
      and then the pandas-style auto-detection converts columns in place.
    """
    dtype = _determine_dtype(bytesio)
    with _wrap_text(bytesio, text_encoding) as textio:
        frame = pandas.read_table(textio, dtype=dtype)
    autocast_dtypes_in_place(frame)
    return frame
Code example #2
0
def _parse_txt(bytesio: io.BytesIO, text_encoding: _TextEncoding) -> DataFrame:
    """Parse txt bytes into a DataFrame, or raise a parse error.

    Peculiarities:

    * The file encoding defaults to UTF-8.
    * The column separator is sniffed from the text via _detect_separator().
    """
    dtype = _determine_dtype(bytesio)
    with _wrap_text(bytesio, text_encoding) as textio:
        separator = _detect_separator(textio)
        frame = pandas.read_table(textio, dtype=dtype, sep=separator)
    autocast_dtypes_in_place(frame)
    return frame
Code example #3
0
def _parse_table(bytesio: io.BytesIO, sep: Optional[str],
                 text_encoding: _TextEncoding) -> DataFrame:
    """Parse delimiter-separated bytes into a DataFrame.

    When `sep` is falsy, the separator is sniffed via _detect_separator().
    """
    with _wrap_text(bytesio, text_encoding) as textio:
        separator = sep or _detect_separator(textio)

        # Pandas' CSV parser, roughly:
        #
        # 1. "Tokenize" the input stream: copy all its _data_ bytes into
        #    memory and keep arrays of "word" and "line" pointers into
        #    that buffer.
        # 2. Determine the list of columns.
        # 3. Per column, convert values to a dtype array.
        # 4. Smoosh the arrays together into a pd.DataFrame.
        #
        # With `low_memory=True` all of that runs inside a bigger chunking
        # loop so the tokenized structure stays small -- but it forces
        # re-coding categories per chunk: `O(Ch * N * Ca lg Ca)`, where Ch
        # is the number of column-chunks, N the number of records and Ca
        # the number of categories.
        #
        # A wide file like `rc11.txt` has an enormous Ch: 9,000 * 60 =
        # 540,000. `general.csv` (our 1.2GB file) sits near ~4,000 even
        # though it has 250x more rows. Pandas exposes no chunk-size knob,
        # and its heuristic is terrible for `rc11.txt`.
        #
        # So: `low_memory=False`. CPU cost becomes `O(N * Co * Ca lg Ca)`,
        # where Co is the number of columns, and memory usage grows with
        # the number of cells -- an extra ~1GB for `general.csv`.
        frame = pandas.read_csv(
            textio,
            dtype='category',
            sep=separator,
            na_filter=False,
            low_memory=False,
        )

    autocast_dtypes_in_place(frame)
    return frame
Code example #4
0
def _parse_xlsx(bytesio: io.BytesIO, _unused: _TextEncoding) -> DataFrame:
    """
    Parse xlsx bytes into a DataFrame.

    Peculiarities:

    * On xlrd.XLRDError this returns a ProcessResult describing the error
      instead of raising (despite the DataFrame return annotation); other
      pandas errors propagate.
    * The bytes are spooled to a temporary file so xlrd can open it by
      name -- calling pandas.read_excel(bytesio) directly would read the
      entire file into RAM.
    """
    # dtype='category' crashes as of 2018-09-11, so read plain objects.
    try:
        with tempfile.NamedTemporaryFile() as temp:
            shutil.copyfileobj(bytesio, temp)
            temp.flush()
            temp.seek(0)
            book = xlrd.open_workbook(temp.name)
            frame = pandas.read_excel(book, engine='xlrd', dtype=object)
    except xlrd.XLRDError as err:
        return ProcessResult(error=f'Error reading Excel file: {str(err)}')

    autocast_dtypes_in_place(frame)
    return frame
Code example #5
0
 def test_autocast_str_categories_from_str_categories(self):
     original = ['1', '2.1', 'Yay']
     table = pd.DataFrame({'A': original}, dtype='category')
     # Mixed non-numeric strings: the autocast must leave the column alone.
     autocast_dtypes_in_place(table)
     assert_frame_equal(table, pd.DataFrame({'A': original}, dtype='category'))
Code example #6
0
 def test_autocast_int_from_str_categories_with_empty_str(self):
     table = pd.DataFrame({'A': ['', '', '1']}, dtype='category')
     autocast_dtypes_in_place(table)
     # Empty strings become NaN, so even int-like data lands in float64.
     want = pd.DataFrame({'A': [np.nan, np.nan, 1.0]}, dtype=np.float64)
     assert_frame_equal(table, want)
Code example #7
0
 def test_autocast_float_from_str_categories_with_dup_floats(self):
     table = pd.DataFrame({'A': ['1', '1.0']}, dtype='category')
     autocast_dtypes_in_place(table)
     # '1' and '1.0' are distinct categories but the same float value.
     want = pd.DataFrame({'A': [1.0, 1.0]}, dtype=np.float64)
     assert_frame_equal(table, want)
Code example #8
0
 def test_autocast_float_from_str_categories_with_empty_str(self):
     # Scenario: read_csv(dtype='category') was used; we now want floats,
     # with the empty string mapping to NaN.
     table = pd.DataFrame({'A': ['1', '2.1', '']}, dtype='category')
     autocast_dtypes_in_place(table)
     want = pd.DataFrame({'A': [1.0, 2.1, np.nan]}, dtype=np.float64)
     assert_frame_equal(table, want)
Code example #9
0
 def test_autocast_int_from_str_categories(self):
     # Scenario: read_csv(dtype='category') was used; we now want ints.
     table = pd.DataFrame({'A': ['1', '2']}, dtype='category')
     autocast_dtypes_in_place(table)
     assert_frame_equal(table, pd.DataFrame({'A': [1, 2]}))
Code example #10
0
 def test_autocast_int_from_str(self):
     # Plain (non-categorical) string column of digits -> int column.
     table = pd.DataFrame({'A': ['1', '2']})
     autocast_dtypes_in_place(table)
     assert_frame_equal(table, pd.DataFrame({'A': [1, 2]}))