Code Example #1
def test_usecols_csv():
    target = {
        'a': [1, 2, 3, 4, 5, 6],
        'b': [2, 3, 4, 5, 6, 7],
        'c': [3, 4, 5, 6, 7, 8],
        'd': [4, 5, 6, 7, 8, 9],
        'e': ['5', '6', '7', '8', '9', '0']
    }
    fp = io.BytesIO(
        b'a,b,c,d,e\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n4,5,6,7,8\n5,6,7,8,9\n6,7,8,9,0'
    )
    fp.seek(0)
    table = csv.read_csv(fp,
                         convert_options=csv.ConvertOptions(
                             column_types={
                                 'a': pa.int32(),
                                 'b': pa.int64(),
                                 'c': pa.int64(),
                                 'd': pa.int64(),
                                 'e': pa.string(),
                             }))
    assert table
    engine.importArrowTable("usecols", table)
    assert not engine.closed
    cursor = engine.executeDML("select * from usecols")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
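The test above depends on a project-specific `engine` object; the pure-pyarrow part can be reproduced on its own. A minimal, self-contained sketch reading an in-memory CSV with explicit column_types:

import io

import pyarrow as pa
from pyarrow import csv

fp = io.BytesIO(b'a,b\n1,x\n2,y\n')
table = csv.read_csv(
    fp,
    convert_options=csv.ConvertOptions(column_types={'a': pa.int32(),
                                                     'b': pa.string()}))
assert table.schema.field('a').type == pa.int32()
assert table.to_pydict() == {'a': [1, 2], 'b': ['x', 'y']}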
Code Example #2
File: parquet.py  Project: halasystems/boxball
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [name for name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names, block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
                                              true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True)

        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='zstd',
                                          version="2.0", use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file, read_options=read_options, parse_options=parse_options,
                                      convert_options=convert_options)
        for batch in stream_reader:
            table = pa.Table.from_batches([batch])
            parquet_writer.write_table(table)
        parquet_writer.close()
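The function above streams a zstd-compressed CSV into Parquet batch by batch, so the whole extract never has to fit in memory. A stripped-down sketch of the same pattern, assuming local files named input.csv and output.parquet:

import pyarrow as pa
import pyarrow.csv as pcsv
import pyarrow.parquet as pq

reader = pcsv.open_csv("input.csv")            # streaming reader, one block at a time
writer = pq.ParquetWriter("output.parquet", schema=reader.schema)
for batch in reader:                           # each item is a RecordBatch
    writer.write_table(pa.Table.from_batches([batch]))
writer.close()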
Code Example #3
 def scan_file(self, bucket, key, schema):
     logging.info(f"delim is {self.delimiter}")
     uri = f"{bucket}/{key}"
     s3fs = fs.S3FileSystem()
     # Run column order validation by opening and not reading anything.
     filestream = s3fs.open_input_stream(uri)
     parse_opts = csv.ParseOptions(delimiter=self.delimiter)
     reader = csv.open_csv(filestream, parse_options=parse_opts)
     for index, col in enumerate(reader.schema):
         if col.name != schema[index].name:
             msg = "column {} is out of order".format(col.name)
             raise ColumnOrderException(msg)
     # Run the rest of the validations.
     filestream = s3fs.open_input_stream(uri)
     opts = csv.ConvertOptions(column_types=schema)
     reader = csv.open_csv(filestream,
                           convert_options=opts,
                           parse_options=parse_opts)
     # Kind of a hack, but it works...if delim wrong, everything is read
     # as one column.
     if len(schema) > 1 and len(reader.schema) == 1:
         raise WrongDelimiterException()
     # Parse through the file, pyarrow will throw exceptions
     # if there's invalid data.
     for batch in reader:
         # If primary key is a string, need to check the column
         # for empty strings.
         if schema.field(self.primary_key).type == "string":
             table = pyarrow.Table.from_batches([batch])
             for val in table[self.primary_key]:
                 if val.as_py() == "":
                     raise EmptyPrimaryKeyException()
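The column-order check above works because csv.open_csv only needs the first block to expose the schema; nothing else is consumed. A minimal local-file sketch of that idea (the path data.csv is assumed):

from pyarrow import csv

reader = csv.open_csv("data.csv", parse_options=csv.ParseOptions(delimiter=","))
# Column names and their order are available here without reading the rest of the file.
print(reader.schema.names)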
Code Example #4
    def read_stores(self):
        s_cols = [
            'retailer_code', 'parent_code', 'fips_state_code',
            'fips_county_code', 'dma_code', 'store_zip3'
        ]
        # To reduce space -- update with dictionary arrays later
        store_convert = {
            'year': pa.uint16(),
            'dma_code': pa.uint16(),
            'retailer_code': pa.uint16(),
            'parent_code': pa.uint16(),
            'store_zip3': pa.uint16(),
            'fips_county_code': pa.uint16(),
            'fips_state_code': pa.uint8()
        }

        # Use pyarrow to read CSVs and parse using the dict -- we have to fix some types again later.
        tmp = pa.concat_tables([
            csv.read_csv(
                x,
                parse_options=csv.ParseOptions(delimiter='\t'),
                convert_options=csv.ConvertOptions(column_types=store_convert))
            for x in self.stores_dict.values()
        ]).to_pandas().rename(columns={'year': 'panel_year'})

        # some columns have blanks --fill with zero to avoid converting to floats(!)
        tmp.loc[:, s_cols] = tmp.loc[:, s_cols].fillna(0)

        # use the compressed types
        my_dict = {
            key: value
            for (key, value) in type_dict.items() if key in tmp.columns
        }
        self.stores_df = tmp.astype(my_dict)
        return
Code Example #5
    def _read_table_arrow(self, source: tp.BinaryIO,
                          schema: pa.Schema) -> pa.Table:

        try:

            read_options = pa_csv.ReadOptions()
            read_options.encoding = 'utf-8'
            read_options.use_threads = False

            parse_options = pa_csv.ParseOptions()
            parse_options.newlines_in_values = True

            convert_options = pa_csv.ConvertOptions()
            convert_options.include_columns = schema.names
            convert_options.column_types = {
                n: t
                for (n, t) in zip(schema.names, schema.types)
            }
            convert_options.strings_can_be_null = True
            convert_options.quoted_strings_can_be_null = False

            return pa_csv.read_csv(source, read_options, parse_options,
                                   convert_options)

        except pa.ArrowInvalid as e:
            err = f"CSV file decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e

        except pa.ArrowKeyError as e:
            err = f"CSV file decoding failed, one or more columns is missing"
            self._log.error(err)
            self._log.exception(str(e))
            raise _ex.EDataCorruption(err) from e
Code Example #6
File: text.py  Project: kandorm/CLINE
 def pa_convert_options(self):
     if self.convert_options is not None:
         convert_options = self.convert_options
     else:
         convert_options = pac.ConvertOptions(
             column_types=FEATURES.type,
         )
     return convert_options
Code Example #7
File: dbapi.py  Project: zy-github-user/pysqream
    def csv_to_table(self, csv_path, table_name, read = None, parse = None, convert = None, con = None, auto_infer = False):
        ' Pyarrow CSV reader documentation: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html '

        if not ARROW:
            return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"
        
        sqream_to_pa = {
            'ftBool':     pa.bool_(),
            'ftUByte':    pa.uint8(),
            'ftShort':    pa.int16(),
            'ftInt':      pa.int32(),
            'ftLong':     pa.int64(),
            'ftFloat':    pa.float32(),
            'ftDouble':   pa.float64(),
            'ftDate':     pa.timestamp('ns'),
            'ftDateTime': pa.timestamp('ns'),
            'ftVarchar':  pa.string(),
            'ftBlob':     pa.utf8()
        }

        start = time.time()
        # Get table metadata
        con = con or self
        con.execute(f'select * from {table_name} where 1=0')
        
        # Map column names to pyarrow types and set Arrow's CSV parameters
        sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
        column_types = zip(con.col_names, [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups])
        read = read or csv.ReadOptions(column_names=con.col_names)
        parse = parse or csv.ParseOptions(delimiter='|')
        convert = convert or csv.ConvertOptions(column_types = None if auto_infer else column_types)
        
        # Read CSV to in-memory arrow format
        csv_arrow = csv.read_csv(csv_path, read_options=read, parse_options=parse, convert_options=convert).combine_chunks()
        num_chunks = len(csv_arrow[0].chunks)
        numpy_cols = []

        # For each column, get the numpy representation for quick packing 
        for col_type, col in zip(sqream_col_types, csv_arrow):
            # Only one chunk after combine_chunks()
            col = col.chunks[0]
            if col_type in  ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
                col = col.to_pandas()
            else:
                col = col.to_numpy()
            
            numpy_cols.append(col)
        
        print (f'total loading csv: {time.time()-start}')
        start = time.time()
        
        # Insert columns into SQream
        col_num = csv_arrow.shape[1]
        con.executemany(f'insert into {table_name} values ({"?,"*(col_num-1)}?)', numpy_cols)
        print (f'total inserting csv: {time.time()-start}')
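The snippet above passes a zip object as column_types; ConvertOptions.column_types is documented to accept a dict mapping column names to types (or a pyarrow Schema), so materializing the pairs into a dict is the safer form. A small sketch:

import pyarrow as pa
from pyarrow import csv

names = ["id", "name"]
types = [pa.int64(), pa.string()]
convert = csv.ConvertOptions(column_types=dict(zip(names, types)))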
Code Example #8
 def read_rms(self):
     self.rms_df = pa.concat_tables([
         csv.read_csv(
             fn,
             parse_options=csv.ParseOptions(delimiter='\t'),
             convert_options=csv.ConvertOptions(column_types={
                 'upc': pa.int64(),
                 'upc_ver_uc': pa.uint8()
             })) for fn in self.rms_dict.values()
     ]).to_pandas()
     return
Code Example #9
    def read_product(self,
                     keep_groups=None,
                     drop_groups=None,
                     keep_modules=None,
                     drop_modules=None):
        prod_cols = [
            'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
            'product_module_descr', 'product_group_code',
            'product_group_descr', 'brand_code_uc', 'brand_descr', 'multi',
            'size1_code_uc', 'size1_amount', 'size1_units', 'dataset_found_uc',
            'size1_change_flag_uc'
        ]

        prod_dict = {
            'upc': pa.int64(),
            'upc_ver_uc': pa.int8(),
            'product_module_code': pa.uint16(),
            'brand_code_uc': pa.uint32(),
            'multi': pa.uint16(),
            'size1_code_uc': pa.uint16()
        }

        prod_df = csv.read_csv(self.product_file,
                               read_options=csv.ReadOptions(encoding='latin'),
                               parse_options=csv.ParseOptions(delimiter='\t'),
                               convert_options=csv.ConvertOptions(
                                   column_types=prod_dict,
                                   include_columns=prod_cols)).to_pandas()
        if keep_groups:
            prod_df = prod_df[prod_df['product_group_code'].isin(keep_groups)]
        if drop_groups:
            prod_df = prod_df[~prod_df['product_group_code'].isin(drop_groups)]
        if keep_modules:
            prod_df = prod_df[prod_df['product_module_code'].isin(
                keep_modules)]
        if drop_modules:
            prod_df = prod_df[~prod_df['product_module_code'].isin(drop_modules
                                                                   )]

        # dictionary encoding to save space
        prod_df['size1_units'] = prod_df['size1_units'].astype('category')
        prod_df['product_module_descr'] = prod_df[
            'product_module_descr'].astype('category')
        prod_df['product_group_code'] = prod_df['product_group_code'].astype(
            'category')

        # clean up product info
        prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace(
            'RTE', '')
        prod_df['brand_descr'] = prod_df['brand_descr'].str.strip(
        ).str.replace('CTL BR', 'Private Label')
        self.prod_df = prod_df.copy()
        return
Code Example #10
    def _read_csv(self) -> "Table":
        import pyarrow as pa
        from pyarrow import csv

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        if not self.output_location.endswith((".csv", ".txt")):
            return pa.Table.from_pydict(dict())
        length = self._get_content_length()
        if length and self.output_location.endswith(".txt"):
            description = self.description if self.description else []
            column_names = [d[0] for d in description]
            read_opts = csv.ReadOptions(
                skip_rows=0,
                column_names=column_names,
                block_size=self._block_size,
                use_threads=True,
            )
            parse_opts = csv.ParseOptions(
                delimiter="\t",
                quote_char=False,
                double_quote=False,
                escape_char=False,
            )
        elif length and self.output_location.endswith(".csv"):
            read_opts = csv.ReadOptions(skip_rows=0,
                                        block_size=self._block_size,
                                        use_threads=True)
            parse_opts = csv.ParseOptions(
                delimiter=",",
                quote_char='"',
                double_quote=True,
                escape_char=False,
            )
        else:
            return pa.Table.from_pydict(dict())

        bucket, key = parse_output_location(self.output_location)
        try:
            return csv.read_csv(
                self._fs.open_input_stream(f"{bucket}/{key}"),
                read_options=read_opts,
                parse_options=parse_opts,
                convert_options=csv.ConvertOptions(
                    quoted_strings_can_be_null=False,
                    timestamp_parsers=self.timestamp_parsers,
                    column_types=self.column_types,
                ),
            )
        except Exception as e:
            _logger.exception(f"Failed to read {bucket}/{key}.")
            raise OperationalError(*e.args) from e
Code Example #11
File: hash_bucket.py  Project: amzn/amazon-ray
def read_delta_file_envelopes(
        annotated_delta_manifests: List[Dict[str, Any]],
        column_names: List[str],
        primary_keys: List[str],
        sort_keys: List[str],
        deltacat_storage=unimplemented_deltacat_storage) \
        -> Optional[List[Dict[str, Any]]]:

    tables_and_annotations = []
    columns_to_read = list(chain(primary_keys, sort_keys))
    for annotated_delta_manifest in annotated_delta_manifests:
        tables = deltacat_storage.download_delta_manifest(
            annotated_delta_manifest,
            file_reader_kwargs={
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.CSV.value]: {
                    pacsv.ReadOptions(column_names=column_names),
                    pacsv.ConvertOptions(include_columns=columns_to_read)
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.PARQUET.value]: {
                    "columns": columns_to_read
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.FEATHER.value]: {
                    "columns": columns_to_read
                },
            },
        )
        annotations = dma.get_annotations(annotated_delta_manifest)
        assert len(tables) == len(annotations), (
            f"Unexpected Error: Length of downloaded delta manifest tables "
            f"({len(tables)}) doesn't match the length of delta manifest "
            f"annotations ({len(annotations)}).")
        tables_and_annotations.append((tables, annotations))
    if not tables_and_annotations:
        return None

    delta_file_envelopes = []
    for tables, annotations in tables_and_annotations:
        for i in range(len(tables)):
            delta_file = delta_file_envelope.of(
                dma.get_annotation_stream_position(annotations[i]),
                dma.get_annotation_file_index(annotations[i]),
                dma.get_annotation_delta_type(annotations[i]),
                tables[i],
            )
            delta_file_envelopes.append(delta_file)
    return delta_file_envelopes
Code Example #12
def _convert_options_from_dict(**kwargs):
    """Returns the convert options for CSV.
    Returns:
        (object) A pyarrow ConvertOptions object.
    """
    return csv.ConvertOptions(
        check_utf8=kwargs.pop('check_utf8', True),
        column_types=kwargs.pop('column_types', None),
        null_values=kwargs.pop('null_values', [" "]),
        true_values=kwargs.pop('true_values', None),
        false_values=kwargs.pop('false_values', None),
        strings_can_be_null=kwargs.pop('strings_can_be_null', True),
        auto_dict_encode=kwargs.pop('auto_dict_encode', None),
        auto_dict_max_cardinality=kwargs.pop('auto_dict_max_cardinality',
                                             None),
        include_columns=kwargs.pop('include_columns', None),
        include_missing_columns=kwargs.pop('include_missing_columns', None))
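A hypothetical call to the helper above; the keyword names mirror the pyarrow.csv.ConvertOptions constructor arguments, and anything not supplied falls back to the defaults chosen in the function:

import pyarrow as pa

opts = _convert_options_from_dict(
    column_types={"id": pa.int64(), "name": pa.string()},
    null_values=["", "NA"],
    strings_can_be_null=True,
)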
Code Example #13
def as_parquet(csv_file, metadata_file):

    result = io.BytesIO()
    csv_parse_options = csv.ParseOptions(delimiter="\t")
    schema_parse_options = csv.ParseOptions(delimiter="|")
    schema = csv.read_csv(metadata_file, parse_options=schema_parse_options)
    pyarrow_schema = create_pyarrow_schema(schema["cleaned_name"],
                                           schema["data_type"])
    csv_convert_options = csv.ConvertOptions(column_types=pyarrow_schema)
    parquet_table = csv.read_csv(csv_file,
                                 parse_options=csv_parse_options,
                                 convert_options=csv_convert_options)
    w = parquet.ParquetWriter(where=result,
                              schema=pyarrow_schema,
                              compression="SNAPPY",
                              flavor="spark")
    w.write_table(parquet_table)
    return result
Code Example #14
File: datamgr.py  Project: cpcloud/ibis
def read_tables(
    names: Iterable[str],
    data_directory: Path,
) -> Iterator[tuple[str, pa.Table]]:
    import pyarrow.csv as pac

    import ibis.backends.pyarrow.datatypes as pa_dt

    for name in names:
        schema = TEST_TABLES[name]
        convert_options = pac.ConvertOptions(column_types={
            name: pa_dt.to_pyarrow_type(type)
            for name, type in schema.items()
        })
        yield name, pac.read_csv(
            data_directory / f'{name}.csv',
            convert_options=convert_options,
        )
Code Example #15
def test_null_col():
    target = {'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [None, None, None]}
    fp = io.BytesIO(b'a,b,c\n1,1,\n2,2,\n3,3,\n')
    fp.seek(0)
    table = csv.read_csv(fp,
                         convert_options=csv.ConvertOptions(column_types={
                             'a': pa.int32(),
                             'b': pa.int64(),
                             'c': pa.int64(),
                         }))
    assert table
    engine.importArrowTable("test_null_col", table)
    assert not engine.closed
    cursor = engine.executeDML("select * from test_null_col")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
Code Example #16
File: scdata.py  Project: das-projects/singlecell
    def read_csv(self, filenames, delimiter=','):
        global parquet_writer
        for file in filenames:
            csv_reader = csv.open_csv(
                file,
                read_options=csv.ReadOptions(use_threads=True),
                parse_options=csv.ParseOptions(delimiter=delimiter),
                convert_options=csv.ConvertOptions(column_types=self.dtype))
            parquet_writer = pq.ParquetWriter(self.parquet_file,
                                              csv_reader.schema)

            nrow = 0
            for batch in csv_reader:
                batch_df = batch.to_pandas()
                nrow += batch_df.shape[0]
                parquet_writer.write_table(pa.Table.from_pandas(df=batch_df))

        parquet_writer.close()
        return ds.dataset(self.parquet_file, format="parquet")
Code Example #17
def test_csv_options(in_type, pd_old_type, pd_new_type):
    schema = pa.schema([("string_col", pa.string())])

    read_options = csv.ReadOptions(skip_rows=1)

    parse_options = csv.ParseOptions(quote_char="'",
                                     escape_char="\\",
                                     delimiter=";",
                                     newlines_in_values=True)

    convert_options = csv.ConvertOptions(
        include_columns=["i", "my_string", "nonexistent_column"],
        include_missing_columns=True,
        null_values=["NULL_STRING"],
        strings_can_be_null=True,
    )

    df = pa_read_csv_to_pandas(
        "tests/data/csv_options_test.csv",
        schema,
        False,
        pd_string=False,
        parse_options=parse_options,
        convert_options=convert_options,
        read_options=read_options,
    )

    expected = [
        "dsfasd;dsffadsf",
        "dsfasd;dsffadsf",
        None,
        "this text\nhas a line break",
        "this text, like so, has commas",
    ]
    assert df.columns.tolist() == ["i", "my_string", "nonexistent_column"]
    assert df["nonexistent_column"].isnull().all()
    assert_series_equal(df["my_string"], Series(expected, name="my_string"))
Code Example #18
 def read_one_sales(fn, stores_list=None, incl_promo=True):
     my_cols = [
         'store_code_uc', 'upc', 'week_end', 'units', 'prmult', 'price'
     ]
     if incl_promo:
         my_cols = my_cols + ['feature', 'display']
     convert_dict = {
         'feature': pa.int8(),
         'display': pa.int8(),
         'prmult': pa.int8(),
         'units': pa.uint16(),
         'store_code_uc': pa.uint32()
     }
     dataset = ds.dataset(
         csv.read_csv(fn,
                      parse_options=csv.ParseOptions(delimiter='\t'),
                      convert_options=csv.ConvertOptions(
                          column_types=convert_dict,
                          include_columns=my_cols)))
     if stores_list is None:
         return dataset.to_table()
     else:
         return dataset.to_table(
             filter=ds.field('store_code_uc').isin(stores_list))
Code Example #19
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from pyarrow.lib import Table

csv = 'accumulated_data_300_million_rows_id_filter.csv'
target_file = '../data/accumulated_data_300_million_rows_id_filter_1mill.parquet'

csv_read_options = pv.ReadOptions(skip_rows=0,
                                  encoding="utf8",
                                  column_names=["unit_id"])

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
data_schema = pa.schema([('unit_id', pa.uint64())])

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema)

table: Table = pv.read_csv(input_file=csv,
                           read_options=csv_read_options,
                           convert_options=csv_convert_options)
pq.write_table(table, target_file)

print('Generated file with the following:')
print('Parquet metadata: ' + str(pq.read_metadata(target_file)))
print('Parquet schema: ' + pq.read_schema(target_file).to_string())
Code Example #20
def csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates):

    include_columns = None  # default value (include all CSV columns)

    # if names is not given then column names will be defined from the first row of the CSV file
    # otherwise pyarrow autogenerated column names will be used (see ReadOptions), so
    # map pandas usecols to pyarrow include_columns accordingly
    if usecols:
        if type(usecols[0]) == str:
            if names:
                include_columns = [f'f{names.index(col)}' for col in usecols]
            else:
                include_columns = usecols  # no autogenerated names
        elif type(usecols[0]) == int:
            include_columns = [f'f{i}' for i in usecols]
        else:
            assert False, f"Failed building pyarrow ConvertOptions due to usecols param value: {usecols}"

    if dtype:
        # dtype pandas read_csv argument maps to pyarrow column_types dict, but column names
        # must match those that are read from CSV (if names is None) or pyarrows generated names (otherwise)
        if isinstance(dtype, dict):
            if names:
                names_list = list(names)
                column_types = {}
                for k, v in dtype.items():
                    # TO-DO: check this is aligned with include_columns
                    column_name = "f{}".format(names_list.index(k))
                    if isinstance(v, pd.CategoricalDtype):
                        column_type = pa.string()
                    else:
                        column_type = pa.from_numpy_dtype(v)
                    column_types[column_name] = column_type

            else:
                column_types = {k: pa.from_numpy_dtype(v) for k, v in dtype.items()}

        else:  # single dtype for all columns
            pa_dtype = pa.from_numpy_dtype(dtype)
            if names:
                column_types = {f"f{list(names).index(k)}": pa_dtype for k in names}
            elif usecols:
                column_types = dict.fromkeys(usecols, pa_dtype)
            else:
                column_types = pa_dtype
    else:
        column_types = None

    # TO-DO: support all possible parse_dates values (now only list of column positions is supported)
    try:
        for column in parse_dates:
            name = f"f{column}"
            # starting from pyarrow=3.0.0 strings are parsed to DateType (converted back to 'object'
            # when using to_pandas), but not TimestampType (that is used to represent np.datetime64)
            # see: pyarrow.from_numpy_dtype(np.datetime64('NaT', 's'))
            # so make pyarrow infer needed type manually
            column_types[name] = pa.timestamp('s')
    except (KeyError, TypeError):
        pass

    convert_options = csv.ConvertOptions(
        column_types=column_types,
        strings_can_be_null=True,
        include_columns=include_columns,
    )
    return convert_options
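A hypothetical call to the helper above, showing how pandas-style arguments are mapped onto pyarrow's autogenerated column names ('f0', 'f1', ...); the argument values are made up for illustration:

import numpy as np

opts = csv_reader_get_pyarrow_convert_options(
    names=['a', 'b', 'c'],
    usecols=['a', 'c'],                # -> include_columns == ['f0', 'f2']
    dtype={'c': np.dtype('float64')},  # -> column_types['f2'] == pa.float64()
    parse_dates=[1],                   # -> column_types['f1'] == pa.timestamp('s')
)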
Code Example #21
import os
import sys

import pyarrow as pa
from pyarrow import csv
import pyarrow.plasma as plasma

csv_name = sys.argv[1]
client = plasma.connect("/tmp/fast_python")

convert_options = csv.ConvertOptions(column_types={"VendorID": pa.bool_()},
                                     true_values=["Y", "1"],
                                     false_values=["N", "2"])
table = csv.read_csv(csv_name, convert_options=convert_options)

pid = os.getpid()

plid = plasma.ObjectID(f"csv-{pid}".ljust(20, " ").encode("us-ascii"))

a = client.put(table, plid)  # eviction
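# Note: pyarrow.plasma was deprecated in Arrow 10.0 and removed in Arrow 12.0,
# so this example needs an older pyarrow and a plasma store already listening
# on /tmp/fast_python.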
Code Example #22
File: csv.py  Project: letranloc/datasets-1
 def pa_convert_options(self):
     convert_options = self.convert_options or pac.ConvertOptions(
         column_types=self.features.type if self.
         features is not None else None)
     return convert_options
Code Example #23
# ver = df.__version__
ver = "7.0.0"
git = ""
task = "groupby"
solution = "datafusion"
fun = ".groupby"
cache = "TRUE"
on_disk = "FALSE"

data_name = os.environ["SRC_DATANAME"]
src_grp = os.path.join("data", data_name + ".csv")
print("loading dataset %s" % src_grp, flush=True)

data = pacsv.read_csv(
    src_grp, convert_options=pacsv.ConvertOptions(auto_dict_encode=True))
print("dataset loaded")

ctx = df.ExecutionContext()
ctx.register_record_batches("x", [data.to_batches()])
print("registered record batches")
# cols = ctx.sql("SHOW columns from x")
# ans.show()

in_rows = data.num_rows
# print(in_rows, flush=True)

task_init = timeit.default_timer()

question = "sum v1 by id1"  # q1
gc.collect()
Code Example #24
import pyarrow as pa
from pyarrow import csv
import pyarrow.compute as pc
import pyarrow.parquet as pq

table = csv.read_csv(
    "../../07-pandas/sec1-intro/yellow_tripdata_2020-01.csv.gz")
pq.write_table(table, "202001.parquet")

convert_options = csv.ConvertOptions(column_types={
    "VendorID": pa.uint8(),
    "passenger_count": pa.uint8(),
})

table = csv.read_csv(
    "../../07-pandas/sec1-intro/yellow_tripdata_2020-01.csv.gz",
    convert_options=convert_options)
# vvv read_csv doesn't support this
table["VendorID"]

tp = table.to_pandas()

# pq.write_table(table, "202001.parquet", use_dictionary=False)
pq.write_table(
    table,
    "202001.parquet",
)

#memory size, disk size
#retype as int64 for both
#memory size, disk size
Code Example #25
csv_parse_options = pv.ParseOptions(delimiter=';')

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
# TODO nullable parameter does not work as expected!
data_schema = pa.schema([
    pa.field(name='start_year', type=pa.string(), nullable=True),
    pa.field(name='unit_id', type=pa.uint64(), nullable=False),
    pa.field(name='value', type=pa.string(), nullable=False),
    pa.field(name='start_epoch_days', type=pa.int16(), nullable=True),
    pa.field(name='stop_epoch_days', type=pa.int16(), nullable=True),
])
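# As the TODO above notes, the nullable flag on pa.field is not enforced when
# reading CSVs: column_types only controls the conversion type, so missing
# values still come through as nulls.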

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema,
                                        include_columns=[
                                            "unit_id", "value", "start_year",
                                            "start_unix_days", "stop_unix_days"
                                        ])

# read_csv: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html#pyarrow.csv.read_csv
table = pv.read_csv(input_file=csv_filename,
                    read_options=csv_read_options,
                    parse_options=csv_parse_options,
                    convert_options=csv_convert_options)

print(table.nbytes)
print(table.num_rows)
print(table.schema)
print(table.column_names)
pandas.set_option('display.max_columns', None)  # print all columns
print(table.to_pandas().head(10))
Code Example #26
 def pa_convert_options(self):
     convert_options = self.convert_options or pac.ConvertOptions()
     return convert_options
Code Example #27
    def read_year(self,
                  year,
                  hh_states_keep=None,
                  hh_states_drop=None,
                  hh_dma_keep=None,
                  hh_dma_drop=None):

        (purch_fn, trip_fn, panelist_fn) = get_fns(self.annual_dict[year])

        hh_ds = ds.dataset(
            csv.read_csv(panelist_fn,
                         parse_options=csv.ParseOptions(delimiter='\t'),
                         convert_options=csv.ConvertOptions(
                             auto_dict_encode=True,
                             auto_dict_max_cardinality=1024)))

        # build an arrow dataset filter object one by one
        my_filter = ds.field('Projection_Factor') > 0
        if hh_states_keep:
            my_filter = my_filter & (
                ds.field('Fips_State_Desc').isin(hh_states_keep))
        if hh_states_drop:
            my_filter = my_filter & (
                ~ds.field('Fips_State_Desc').isin(hh_states_drop))
        if hh_dma_keep:
            my_filter = my_filter & (ds.field('DMA_Cd').isin(hh_dma_keep))
        if hh_dma_drop:
            my_filter = my_filter & (~ds.field('DMA_Cd').isin(hh_dma_drop))

        # convert to pandas and get unique HH list
        hh_df = hh_ds.to_table(filter=my_filter).to_pandas().rename(
            columns=hh_dict_rename)
        hh_list = hh_df.household_code.unique()

        # use pyarrow filter to filter trips for just our households
        trip_df = ds.dataset(csv.read_csv(trip_fn, parse_options=csv.ParseOptions(delimiter='\t')))\
                  .to_table(filter=ds.field('household_code').isin(hh_list)).to_pandas()

        trip_list = trip_df.trip_code_uc.unique()
        upc_list = self.prod_df.upc.unique()

        # use pyarrow to filter purchases using trips and UPCs only
        purch_ds = ds.dataset(
            csv.read_csv(purch_fn,
                         parse_options=csv.ParseOptions(delimiter='\t'),
                         convert_options=csv.ConvertOptions(
                             auto_dict_encode=True,
                             auto_dict_max_cardinality=1024)))
        purch_filter = ds.field('trip_code_uc').isin(trip_list) & ds.field(
            'upc').isin(upc_list)
        purch_df = purch_ds.to_table(filter=purch_filter).to_pandas()

        # Add the fields to the trips and purchases for convenience later
        trip_df2 = pd.merge(trip_df,
                            hh_df[self.hh_cols],
                            on=['household_code', 'panel_year'])
        purch_df2 = pd.merge(pd.merge(
            purch_df,
            self.prod_df[self.prod_cols], on=['upc', 'upc_ver_uc']),
            trip_df2[self.hh_cols+['trip_code_uc', 'purchase_date', 'store_code_uc']], on=['trip_code_uc'])\
            .rename(columns={'fips_state_desc': 'hh_state_desc'})

        self.purch_df = self.purch_df.append(purch_df2, ignore_index=True)
        self.trip_df = self.trip_df.append(trip_df2, ignore_index=True)
        self.hh_df = self.hh_df.append(hh_df, ignore_index=True)
        return
Code Example #28
import pyarrow as pa
from pyarrow import csv

if __name__ == "__main__":
    # read CSV
    CSV_NAME = "test.csv"
    # make sure bools are read correctly
    options = csv.ConvertOptions(column_types={
        "i64": pa.int64(),
        "bool": pa.bool_()
    })
    table = csv.read_csv(CSV_NAME, convert_options=options)
    # write table to arrow
    writer = pa.RecordBatchFileWriter("test2.arrow", table.schema)
    writer.write_table(table)
    writer.close()
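For completeness, a table written this way can be read back with pa.ipc.open_file, the reader counterpart of RecordBatchFileWriter:

import pyarrow as pa

with pa.memory_map("test2.arrow", "r") as source:
    roundtripped = pa.ipc.open_file(source).read_all()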
Code Example #29
tot_bytes = 0
for name in table.column_names:
    col_bytes = table[name].nbytes
    col_type = table[name].type
    print(name, col_bytes // (1024**2))
    tot_bytes += col_bytes
print("Total", tot_bytes // (1024**2))

table["store_and_fwd_flag"].unique()

table_df = table.to_pandas()

convert_options = csv.ConvertOptions(
    column_types={
        "VendorID": pa.bool_(),
        # "trip_distance": pa.float16()
    },
    true_values=["Y", "1"],
    false_values=["N", "2"])
table = csv.read_csv("../sec1-intro/yellow_tripdata_2020-01.csv.gz",
                     convert_options=convert_options)
print(table["store_and_fwd_flag"].unique(),
      table["store_and_fwd_flag"].nbytes // (1024**2),
      table["VendorID"].nbytes // 1024,
      table["store_and_fwd_flag"].nbytes // 1024)

x = pa.array([False, True]).cast(pa.string()).cast(pa.bool_())

table_df = table.to_pandas()
print(table_df.store_and_fwd_flag)
mission_impossible = table.to_pandas(self_destruct=True)
Code Example #30
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns
    a dataframe
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read these columns in as strings.
    # Validators will still treat them as dates but will run validation against strings
    # for cols expecting values to match a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df