Ejemplo n.º 1
0
 def parse_file(logger, reader_format: str, url, reader_options: dict) -> List:
     """Load ``url`` with the pandas reader selected by ``reader_format``.

     Args:
         logger: Object exposing ``error(msg)``; used to report an
             unsupported format before raising.
         reader_format: One of ``"csv"``, ``"flat_json"``, ``"html"``,
             ``"excel"``, ``"feather"``, ``"parquet"``, ``"orc"`` or
             ``"pickle"``.
         url: Source handed unchanged to the pandas reader.
         reader_options: Extra keyword arguments forwarded to the reader.

     Returns:
         A list with the reader's output. Every format contributes exactly
         one element except ``"html"``: ``pd.read_html`` returns a list,
         whose elements are all added to the result.

     Raises:
         Exception: If ``reader_format`` is not one of the supported names.
     """
     result = []
     if reader_format == "csv":
         # pandas.read_csv additional arguments can be passed to customize how to parse csv.
         # see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
         result.append(pd.read_csv(url, **reader_options))
     elif reader_format == "flat_json":
         # We can add option to call to pd.normalize_json to normalize semi-structured JSON data into a flat table
         # by asking user to specify how to flatten the nested columns
         result.append(pd.read_json(url, **reader_options))
     elif reader_format == "html":
         result += pd.read_html(url, **reader_options)
     elif reader_format == "excel":
         result.append(pd.read_excel(url, **reader_options))
     elif reader_format == "feather":
         result.append(pd.read_feather(url, **reader_options))
     elif reader_format == "parquet":
         result.append(pd.read_parquet(url, **reader_options))
     elif reader_format == "orc":
         result.append(pd.read_orc(url, **reader_options))
     elif reader_format == "pickle":
         result.append(pd.read_pickle(url, **reader_options))
     else:
         # No exception is being handled here, so a formatted traceback would
         # only add a misleading "NoneType: None" line to the message.
         reason = f"Reader {reader_format} is not supported"
         logger.error(reason)
         raise Exception(reason)
     return result
Ejemplo n.º 2
0
def test_orc_reader_boolean_type(datadir, orc_file):
    """Check that cudf and pandas produce the same frame for ``orc_file``."""
    orc_path = datadir / orc_file

    expected = pd.read_orc(orc_path)
    got = cudf.read_orc(orc_path).to_pandas()

    assert_eq(expected, got)
Ejemplo n.º 3
0
def test_orc_reader_basic(dirpath):
    """Read ``TestOrcFile.test1.orc`` and compare it with a hand-built frame."""
    # Column name -> (values, numpy dtype) for the expected frame.
    column_spec = {
        "boolean1": ([False, True], "bool"),
        "byte1": ([1, 100], "int8"),
        "short1": ([1024, 2048], "int16"),
        "int1": ([65536, 65536], "int32"),
        "long1": ([9223372036854775807, 9223372036854775807], "int64"),
        "float1": ([1.0, 2.0], "float32"),
        "double1": ([-15.0, -5.0], "float64"),
        "bytes1": ([b"\x00\x01\x02\x03\x04", b""], "object"),
        "string1": (["hi", "bye"], "object"),
    }
    data = {
        name: np.array(values, dtype=dtype)
        for name, (values, dtype) in column_spec.items()
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
    # Only the columns listed above are requested from the file.
    got = read_orc(inputfile, columns=data.keys())

    tm.assert_equal(expected, got)
Ejemplo n.º 4
0
def test_orc_reader_decimal(dirpath):
    """Compare the first ten rows of ``TestOrcFile.decimal.orc`` with known decimals."""
    from decimal import Decimal

    # Only testing the first 10 rows of area_data
    decimal_texts = (
        "-1000.50000",
        "-999.60000",
        "-998.70000",
        "-997.80000",
        "-996.90000",
        "-995.10000",
        "-994.11000",
        "-993.12000",
        "-992.13000",
        "-991.14000",
    )
    first_rows = [Decimal(text) for text in decimal_texts]
    data = {"_col0": np.array(first_rows, dtype="object")}
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
    got = read_orc(inputfile).iloc[:10]

    tm.assert_equal(expected, got)
Ejemplo n.º 5
0
def test_orc_reader_empty(dirpath):
    """Read ``TestOrcFile.emptyFile.orc`` and expect a zero-row frame with typed columns."""
    # Column name -> dtype of the expected (empty) column, in column order.
    schema = {
        "boolean1": "bool",
        "byte1": "int8",
        "short1": "int16",
        "int1": "int32",
        "long1": "int64",
        "float1": "float32",
        "double1": "float64",
        "bytes1": "object",
        "string1": "object",
    }
    columns = list(schema)

    expected = pd.DataFrame(index=pd.RangeIndex(0))
    for colname, dtype in schema.items():
        expected[colname] = pd.Series(dtype=dtype)

    inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    got = read_orc(inputfile, columns=columns)

    tm.assert_equal(expected, got)
Ejemplo n.º 6
0
def test_orc_reader_multiple_files(datadir, num_rows):
    """Read the same ORC file twice through cudf and compare with stacked pandas reads."""
    orc_path = datadir / "TestOrcFile.testSnappy.orc"

    # Two pandas reads of the same file, stacked, mirror the two-path cudf read.
    frames = [pd.read_orc(orc_path) for _ in range(2)]
    combined = pd.concat(frames, ignore_index=True)

    got = cudf.read_orc([orc_path, orc_path], num_rows=num_rows).to_pandas()

    # Slice rows out of the whole dataframe for comparison as PyArrow doesn't
    # have an API to read a subsection of rows from the file
    expected = combined[:num_rows].reset_index(drop=True)

    assert_eq(expected, got)
Ejemplo n.º 7
0
def test_orc_timestamp_read(datadir):
    """Check that cudf and pandas agree on ``TestOrcFile.timestamp.issue.orc``."""
    orc_path = datadir / "TestOrcFile.timestamp.issue.orc"

    expected = pd.read_orc(orc_path)
    got = cudf.read_orc(orc_path)

    assert_eq(expected, got)
Ejemplo n.º 8
0
 def _read_orc_file(self):
     """Collect every row of every path matched under ``self.orc_dir_name``.

     Returns:
         list: One ``df.loc[i]`` entry per row, in glob order and then row order.
     """
     # glob is called without recursive=True, so "**" matches like "*" here.
     pattern = os.path.join(self.orc_dir_name, '**/*')
     rows = []
     for orc_path in glob.glob(pattern):
         frame = pd.read_orc(orc_path)
         rows.extend(frame.loc[idx] for idx in range(len(frame)))
     return rows
Ejemplo n.º 9
0
 def parse_file(logger, reader_format: str, url,
                reader_options: dict) -> List:
     """Load ``url`` with the pandas reader selected by ``reader_format``.

     Args:
         logger: Object exposing ``error(msg)``; used to report an
             unsupported format before raising.
         reader_format: One of ``"csv"``, ``"json"``, ``"html"``,
             ``"excel"``, ``"feather"``, ``"parquet"``, ``"orc"`` or
             ``"pickle"``.
         url: Source handed unchanged to the pandas reader.
         reader_options: Extra keyword arguments forwarded to the reader.

     Returns:
         A list with the reader's output. Every format contributes exactly
         one element except ``"html"``: ``pd.read_html`` returns a list,
         whose elements are all added to the result.

     Raises:
         Exception: If ``reader_format`` is not one of the supported names.
     """
     result = []
     if reader_format == "csv":
         # pandas.read_csv additional arguments can be passed to customize how to parse csv.
         # see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
         result.append(pd.read_csv(url, **reader_options))
     elif reader_format == "json":
         result.append(pd.read_json(url, **reader_options))
     elif reader_format == "html":
         result += pd.read_html(url, **reader_options)
     elif reader_format == "excel":
         result.append(pd.read_excel(url, **reader_options))
     elif reader_format == "feather":
         result.append(pd.read_feather(url, **reader_options))
     elif reader_format == "parquet":
         result.append(pd.read_parquet(url, **reader_options))
     elif reader_format == "orc":
         result.append(pd.read_orc(url, **reader_options))
     elif reader_format == "pickle":
         result.append(pd.read_pickle(url, **reader_options))
     else:
         # No exception is being handled here, so a formatted traceback would
         # only add a misleading "NoneType: None" line to the message.
         reason = f"Reader {reader_format} is not supported"
         logger.error(reason)
         raise Exception(reason)
     return result
Ejemplo n.º 10
0
Archivo: io.py Proyecto: prutskov/modin
def read_orc(
    path, columns: Optional[List[str]] = None, **kwargs
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Load an ORC object from the file path, returning a DataFrame.

    The read itself is delegated to ``pandas.read_orc``; its result is
    wrapped in this module's ``DataFrame``.
    """
    ErrorMessage.default_to_pandas("read_orc")
    Engine.subscribe(_update_engine)
    pandas_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_frame)
Ejemplo n.º 11
0
def test_orc_reader_decimal_type(datadir, orc_file):
    """Compare cudf and pandas reads of ``orc_file`` after stringifying ``col8``."""
    orc_path = datadir / orc_file
    expected = pd.read_orc(orc_path)
    got = cudf.read_orc(orc_path).to_pandas()

    # Converting to strings since pandas keeps it in decimal
    for frame in (expected, got):
        frame["col8"] = frame["col8"].astype("str")

    assert_eq(expected, got)
Ejemplo n.º 12
0
def test_orc_writer_decimal(tmpdir, scale):
    """Write a decimal column with cudf and read it back with pandas."""
    np.random.seed(0)
    orc_path = tmpdir / "decimal.orc"

    source = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)})
    decimal_dtype = Decimal64Dtype(7, scale)
    source["dec_val"] = source["dec_val"].astype(decimal_dtype)

    source.to_orc(orc_path)

    round_tripped = pd.read_orc(orc_path)
    assert_eq(source.to_pandas()["dec_val"], round_tripped["dec_val"])
Ejemplo n.º 13
0
def test_empty_string_columns(data):
    """Round-trip a string column through ORC and compare cudf and pandas reads."""
    orc_buffer = BytesIO()

    source = cudf.DataFrame({"string": data}, dtype="str")
    source.to_orc(orc_buffer)

    # pandas reads the buffer first, then cudf (same order as written here).
    pandas_frame = pd.read_orc(orc_buffer)
    cudf_frame = cudf.read_orc(orc_buffer)

    assert_eq(source, cudf_frame)
    assert_eq(pandas_frame, cudf_frame)
Ejemplo n.º 14
0
def test_empty_dataframe():
    """Round-trip an empty cudf frame through ORC and compare both readers."""
    orc_buffer = BytesIO()
    source = cudf.DataFrame()
    source.to_orc(orc_buffer)

    # Raise error if column name is mentioned, but it doesn't exist.
    missing_columns = ["a"]
    with pytest.raises(RuntimeError):
        cudf.read_orc(orc_buffer, columns=missing_columns)

    cudf_frame = cudf.read_orc(orc_buffer)
    pandas_frame = pd.read_orc(orc_buffer)

    assert_eq(source, cudf_frame)
    assert_eq(pandas_frame, cudf_frame)
Ejemplo n.º 15
0
def test_empty_string_columns(data):
    """Round-trip a string column through ORC and compare cudf and pandas reads.

    When pandas reports ``pd.StringDtype()`` for the column, the cudf result is
    converted with ``to_pandas(nullable=True)`` before the comparison.
    """
    orc_buffer = BytesIO()

    source = cudf.DataFrame({"string": data}, dtype="str")
    source.to_orc(orc_buffer)

    pandas_frame = pd.read_orc(orc_buffer)
    cudf_frame = cudf.read_orc(orc_buffer)

    assert_eq(source, cudf_frame)

    if pandas_frame["string"].dtype == pd.StringDtype():
        comparable = cudf_frame.to_pandas(nullable=True)
    else:
        comparable = cudf_frame
    assert_eq(pandas_frame, comparable)
Ejemplo n.º 16
0
def test_orc_reader_snappy_compressed(dirpath):
    """Compare the first ten rows of ``TestOrcFile.testSnappy.orc`` with known values."""
    int_values = [
        -1160101563,
        1181413113,
        2065821249,
        -267157795,
        172111193,
        1752363137,
        1406072123,
        1911809390,
        -1308542224,
        -467100286,
    ]
    string_values = [
        "f50dcb8",
        "382fdaaa",
        "90758c6",
        "9e8caf3f",
        "ee97332b",
        "d634da1",
        "2bea4396",
        "d67d89e8",
        "ad71007e",
        "e8c82066",
    ]
    data = {
        "int1": np.array(int_values, dtype="int32"),
        "string1": np.array(string_values, dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    got = read_orc(inputfile).iloc[:10]

    tm.assert_equal(expected, got)
Ejemplo n.º 17
0
def test_orc_reader_date_high(dirpath):
    """Compare the first ten rows of ``TestOrcFile.testDate2038.orc`` with known values."""
    timestamps = [
        "2038-05-05 12:34:56.100000",
        "2038-05-05 12:34:56.100100",
        "2038-05-05 12:34:56.100200",
        "2038-05-05 12:34:56.100300",
        "2038-05-05 12:34:56.100400",
        "2038-05-05 12:34:56.100500",
        "2038-05-05 12:34:56.100600",
        "2038-05-05 12:34:56.100700",
        "2038-05-05 12:34:56.100800",
        "2038-05-05 12:34:56.100900",
    ]
    # Every one of the ten expected rows carries the same date.
    dates = [datetime.date(2038, 12, 25)] * 10
    data = {
        "time": np.array(timestamps, dtype="datetime64[ns]"),
        "date": np.array(dates, dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    got = read_orc(inputfile).iloc[:10]

    tm.assert_equal(expected, got)
Ejemplo n.º 18
0
def read_orc(
    path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs
) -> DataFrame:
    """Read ORC data via ``pandas.read_orc`` and wrap the result in a ``DataFrame``."""
    ErrorMessage.default_to_pandas("read_orc")
    pandas_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_frame)
Ejemplo n.º 19
0
def read_orc(data_fp):
    """Return the frame that ``pd.read_orc`` loads from ``data_fp``."""
    frame = pd.read_orc(data_fp)
    return frame
Ejemplo n.º 20
0
        for j in range(num_ints):
            bits = np_arr[i, 32 * j:32 * j + 32]
            # int_val = int(''.join(map(str, arr)), base=2)
            int_val = 0
            for digit in bits:
                int_val = (int_val << 1) + digit
            out_sub[j] = int_val
        out[i, :] = out_sub
    return out


if __name__ == '__main__':
    # Script entry point: read one ORC part file to learn the feature count,
    # construct an LSHBias object, and upload its W array with gpuarray.to_gpu.
    basepath = "/run/media/sharwinbobde/ExtraStorage/2M-scaled-array-1.orc/"
    # A single sample part file is enough to discover the number of features.
    sample_file = "part-00012-7d53a446-d692-475a-853f-9e55ccc8e9fa-c000.snappy.orc"
    df = pd.read_orc(basepath + sample_file)
    # Shorten the feature column's name to "vec" for the lookups below.
    df = df.rename(columns={"FeatureVector_all_features": "vec"})

    num_records = df.shape[0]
    # Feature count is taken from the length of the first row's vector.
    # NOTE(review): presumably every row has the same vector length — confirm.
    num_features = len(df['vec'][0])
    print(num_features)
    print(f"num_records = {num_records}")
    print(f"num_features = {num_features}")

    LSH_NUM_BITS = int(2**13)  # 8192

    LSH = LSHBias(feature_dim=num_features, bits=LSH_NUM_BITS)

    # Copy LSH.W into a float32 NumPy array before handing it to the GPU.
    W = np.array(LSH.W, dtype=np.float32)
    # NOTE(review): the variable is named b_gpu but holds W — confirm intent.
    b_gpu = gpuarray.to_gpu(W)  # reuse this every time
Ejemplo n.º 21
0
def test_orc_reader_multi_file_multi_stripe(datadir):
    """Read selected stripes of one file passed twice to cudf and compare with pandas."""
    orc_path = datadir / "TestOrcFile.testStripeLevelStats.orc"

    # One stripe list per input path.
    stripes_per_file = [[0, 1], [2]]
    got = cudf.read_orc(
        [orc_path, orc_path], engine="cudf", stripes=stripes_per_file
    )
    expected = pd.read_orc(orc_path)

    assert_eq(expected, got)
Ejemplo n.º 22
0
Archivo: io.py Proyecto: atomicai/modin
def read_orc(
    path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs
) -> DataFrame:  # noqa: D103
    # The read is delegated to pandas.read_orc; the result is wrapped in DataFrame.
    ErrorMessage.default_to_pandas("read_orc")
    Engine.subscribe(_update_engine)
    pandas_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_frame)
Ejemplo n.º 23
0
 def read_orc(filepath: str, **kwargs) -> pd.DataFrame:
     """Return the frame ``pd.read_orc`` loads from ``filepath``, forwarding ``kwargs``."""
     frame = pd.read_orc(filepath, **kwargs)
     return frame