Example 1
def test_open_union_dataset_with_additional_kwargs(multisourcefs):
    child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet')
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset([child], format="parquet")
Example 2
def load_flowsheet_dataset(path):
    dat = ds.dataset(path, format='parquet').to_table()
    return dat
Example 3
    def read_year(self,
                  year,
                  hh_states_keep=None,
                  hh_states_drop=None,
                  hh_dma_keep=None,
                  hh_dma_drop=None):

        (purch_fn, trip_fn, panelist_fn) = get_fns(self.annual_dict[year])

        hh_ds = ds.dataset(
            csv.read_csv(panelist_fn,
                         parse_options=csv.ParseOptions(delimiter='\t'),
                         convert_options=csv.ConvertOptions(
                             auto_dict_encode=True,
                             auto_dict_max_cardinality=1024)))

        # build an arrow dataset filter object one by one
        my_filter = ds.field('Projection_Factor') > 0
        if hh_states_keep:
            my_filter = my_filter & (
                ds.field('Fips_State_Desc').isin(hh_states_keep))
        if hh_states_drop:
            my_filter = my_filter & (
                ~ds.field('Fips_State_Desc').isin(hh_states_drop))
        if hh_dma_keep:
            my_filter = my_filter & (ds.field('DMA_Cd').isin(hh_dma_keep))
        if hh_dma_drop:
            my_filter = my_filter & (~ds.field('DMA_Cd').isin(hh_dma_drop))

        # convert to pandas and get unique HH list
        hh_df = hh_ds.to_table(filter=my_filter).to_pandas().rename(
            columns=hh_dict_rename)
        hh_list = hh_df.household_code.unique()

        # use pyarrow filter to filter trips for just our households
        trip_df = ds.dataset(csv.read_csv(trip_fn, parse_options=csv.ParseOptions(delimiter='\t')))\
                  .to_table(filter=ds.field('household_code').isin(hh_list)).to_pandas()

        trip_list = trip_df.trip_code_uc.unique()
        upc_list = self.prod_df.upc.unique()

        # use pyarrow to filter purchases using trips and UPCs only
        purch_ds = ds.dataset(
            csv.read_csv(purch_fn,
                         parse_options=csv.ParseOptions(delimiter='\t'),
                         convert_options=csv.ConvertOptions(
                             auto_dict_encode=True,
                             auto_dict_max_cardinality=1024)))
        purch_filter = ds.field('trip_code_uc').isin(trip_list) & ds.field(
            'upc').isin(upc_list)
        purch_df = purch_ds.to_table(filter=purch_filter).to_pandas()

        # Add the fields to the trips and purchases for convenience later
        trip_df2 = pd.merge(trip_df,
                            hh_df[self.hh_cols],
                            on=['household_code', 'panel_year'])
        purch_df2 = pd.merge(pd.merge(
            purch_df,
            self.prod_df[self.prod_cols], on=['upc', 'upc_ver_uc']),
            trip_df2[self.hh_cols+['trip_code_uc', 'purchase_date', 'store_code_uc']], on=['trip_code_uc'])\
            .rename(columns={'fips_state_desc': 'hh_state_desc'})

        self.purch_df = self.purch_df.append(purch_df2, ignore_index=True)
        self.trip_df = self.trip_df.append(trip_df2, ignore_index=True)
        self.hh_df = self.hh_df.append(hh_df, ignore_index=True)
        return
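A minimal side sketch (not part of the original snippet) of how the filter expressions above compose; the column names and values are illustrative assumptions. Expressions built from ds.field combine with &, | and ~ and are pushed down when the table is materialized.

import pyarrow.dataset as ds

# Hypothetical columns/values mirroring the pattern in read_year()
keep_states = ['CA', 'NY']
drop_dmas = [501]

expr = ds.field('Projection_Factor') > 0
expr = expr & ds.field('Fips_State_Desc').isin(keep_states)
expr = expr & ~ds.field('DMA_Cd').isin(drop_dmas)

# some_dataset.to_table(filter=expr) would then apply the combined predicate
# during the scan instead of after loading everything into pandas.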
Example 4
def test_open_dataset_unsupported_format(tempdir):
    _, path = _create_single_file(tempdir)
    with pytest.raises(ValueError, match="format 'blabla' is not supported"):
        ds.dataset([path], format="blabla")
Example 5
def test_open_dataset_validate_sources(tempdir):
    _, path = _create_single_file(tempdir)
    dataset = ds.dataset(path)
    with pytest.raises(ValueError,
                       match="Expected a path-like or Source, got"):
        ds.dataset([dataset])
Example 6
def coalesce_resource_table(infolder: str, outfolder: str, archive_folder: str,
                            table: str, state: SqCoalesceState) -> None:
    """This routine coalesces all the parquet data in the folder provided

    This function MUST be called with sqPoller as the table the first time to
    build the polling period sample. Without this, it's not possible to compute
    the records to be written for a period accurately. The polling periods are
    computed when this function is called the first time with None as the
    state field. This function stuffs the sqPoller timeblocks as the polling
    period in the state block and returns it. The state object returned also
    has some statistics written such as number of files written, number of
    records written and so on.

    :param infolder: str, folder to read data in from
    :param outfolder: str, folder to write data to
    :param archive_folder: str, folder to store the archived files in
    :param table: str, name of table we're coalescing
    :param state: SqCoalesceState, state about this coalescing run
    :returns: Nothing
    """

    def compute_block_start(start):
        if state.period.total_seconds() < 24*3600:
            block_start = datetime(year=start.year, month=start.month,
                                   day=start.day, hour=start.hour,
                                   tzinfo=timezone.utc)
        elif 24*3600 <= state.period.total_seconds() < 24*3600*30:
            block_start = datetime(year=start.year, month=start.month,
                                   day=start.day, tzinfo=timezone.utc)
        elif 24*3600*30 <= state.period.total_seconds() < 24*3600*365:
            block_start = datetime(year=start.year, month=start.month,
                                   tzinfo=timezone.utc)
        else:
            block_start = datetime(year=start.year, tzinfo=timezone.utc)
        return block_start

    partition_cols = ['sqvers', 'namespace']
    dodel = True

    if table == "sqPoller":
        wr_polling_period = True
        state.poller_periods = set()
    else:
        wr_polling_period = False
    state.wrfile_count = 0
    state.wrrec_count = 0
    state.table_name = table
    schema = state.schema

    if state.schema.type == "record":
        state.keys = schema.key_fields()
        if state.current_df.empty:
            state.current_df = get_last_update_df(outfolder, state)

    # Ignore reading the compressed files
    dataset = ds.dataset(infolder, partitioning='hive', format='parquet',
                         ignore_prefixes=state.ign_pfx)

    state.logger.info(f'Examining {len(dataset.files)} {table} files '
                      f'for coalescing')
    fdf = get_file_timestamps(dataset.files)
    if fdf.empty:
        if (table == 'sqPoller') or (not state.poller_periods):
            return

    assert(len(dataset.files) == fdf.shape[0])
    polled_periods = sorted(state.poller_periods)
    if fdf.empty:
        state.logger.info(f'No updates for {table} to coalesce')
        start = polled_periods[0]
    else:
        start = fdf.timestamp.iloc[0]
    utcnow = datetime.now(timezone.utc)

    # We now need to determine if we're coalescing a lot of data, at the start
    # or if we're only coalescing for the last interval.
    if (utcnow < start):
        logging.error(
            'ERROR: Something is off, now is earlier than dates on files')
        return

    # We write data in fixed size 1 hour time blocks. Data from 10-11 is
    # written out as one block, data from 11-12 as another and so on.
    # Specifically, we write out 11:00:00 to 11:59:59 in the block
    block_start = compute_block_start(start)
    block_end = block_start + state.period

    # NOTE: You need the parentheses around the date comparison for some reason
    if (block_end > utcnow):
        return

    readblock = []
    wrfile_count = 0

    # We may start coalescing when nothing has changed for some initial period.
    # We have to write out records for that period.
    if schema.type == "record":
        for interval in polled_periods:
            if not fdf.empty and (block_end < interval):
                break
            pre_block_start = compute_block_start(interval)
            pre_block_end = pre_block_start + state.period
            write_files(readblock, infolder, outfolder, partition_cols,
                        state, pre_block_start, pre_block_end)

    for row in fdf.itertuples():
        if block_start <= row.timestamp < block_end:
            readblock.append(row.file)
            continue

        # Write data if either there's data to be written (readblock isn't
        # empty) OR this table is a record type and the poller was alive during
        # this period (the state's poller period for this window isn't blank)
        if readblock or ((schema.type == "record") and
                         block_start in state.poller_periods):

            write_files(readblock, infolder, outfolder, partition_cols, state,
                        block_start, block_end)
            wrfile_count += len(readblock)
        if wr_polling_period and readblock:
            state.poller_periods.add(block_start)
        # Archive the saved files
        if readblock:
            archive_coalesced_files(readblock, archive_folder, state, dodel)

        # We have to find the timeslot where this record fits
        block_start = block_end
        block_end = block_start + state.period
        readblock = []
        if schema.type != "record":
            # We can jump directly to the timestamp corresponding to this
            # row's timestamp
            block_start = compute_block_start(row.timestamp)
            block_end = block_start + state.period
            if (row.timestamp > block_end) or (block_end > utcnow):
                break
            readblock = [row.file]
            continue

        while row.timestamp > block_end:
            if block_start in state.poller_periods:
                write_files(readblock, infolder, outfolder, partition_cols,
                            state, block_start, block_end)
                # Nothing to archive here, and we're not counting coalesced
                # records since these are duplicates
            block_start = block_end
            block_end = block_start + state.period
        if block_end > utcnow:
            break
        readblock = [row.file]

    # The last batch that ended before the block end
    if readblock or (fdf.empty and (schema.type == "record") and
                     block_start in state.poller_periods):
        write_files(readblock, infolder, outfolder, partition_cols, state,
                    block_start, block_end)
        wrfile_count += len(readblock)
        if wr_polling_period:
            state.poller_periods.add(block_start)
        archive_coalesced_files(readblock, archive_folder, state, dodel)

    state.wrfile_count = wrfile_count
    return
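A hedged mini-example of the discovery step used at the top of coalesce_resource_table; the path and prefix list are assumptions. ignore_prefixes tells the dataset factory to skip files and directories whose names start with any of the given prefixes, which is how compressed or in-progress files are left out of the scan.

import pyarrow.dataset as ds

dataset = ds.dataset('/path/to/infolder', format='parquet',
                     partitioning='hive',
                     ignore_prefixes=['.', '_', 'coalesced'])
# Only the non-ignored files are listed and read
print(len(dataset.files))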
Example 7
def test_open_dataset_validate_sources(tempdir):
    _, path = _create_single_file(tempdir)
    dataset = ds.dataset(path)
    with pytest.raises(TypeError,
                       match="Dataset objects are currently not supported"):
        ds.dataset([dataset])
Example 8
    feather.write_feather(df, f'{name}.feather')
    sparse.save_npz(name+'.npz' , fingerprint_matrix)

    print(f'Job number {job_count} complete.')
    print(f'Job contained {len(smiles_list)} smiles strings')
    print(f'Job generated sparse matrix with {len(row_idx)} row_idx')
    print(f'Job generated sparse matrix with {len(col_idx)} col_idx')

columns = ['standard_smiles',
           'canonical_id',
           'docking_score']

# # source = '/data/dockop_data/AmpC_screen_table_clean.feather'
# reader = pa.ipc.open_file(filename)
# enumerate_list = [(index, element.to_table()) for index, element in enumerate(fragments)]



## Use the following for reading a larger partitioned set of feather (.molchunk) files
dataset_path = pathlib.Path('/path/to/outfile.molchunk')
dataset = ds.dataset(dataset_path, format="feather")
fragments = [file for file in dataset.get_fragments()]

GB = 1024 ** 3
ray.init(num_cpus=20, _memory=32*GB, object_store_memory=32*GB)

futures = [fp_to_batch.remote(index, element.to_table()) for index, element in enumerate(fragments)]
results = [ray.get(f) for f in futures]
        row["enumerated_smiles"] = smiles_list
        # row["onbits_fp"] =list(fp.GetOnBits())

        return row

    except ValueError:
        row["standard_smiles"] = 'dropped'
        row["selfies"] = 'dropped'
        row["inchi"] = 'dropped'
        row["inchikey"] = 'dropped'
        row["enumerated_smiles"] = list('dropped')
        return row


# Load the dataset from parquet one by one
dataset = ds.dataset(dataset_dir, format="parquet")

# Create a list of fragments that are not memory loaded
fragments = [file for file in dataset.get_fragments()]

for count, element in enumerate(fragments):
    #cast the fragment as a pandas df
    df_docked = element.to_table().to_pandas()
    #reset the index
    df_docked = df_docked.reset_index(drop=True)

    #now write the nearest neighbor name and smiles to the df
    smiles_column = 'Smile'
    df_add_nn = dm.parallelized(_preprocess,
                                list(df_docked.iterrows()),
                                arg_type='args',
Example 10
smi_path = '/cbica/home/grahamth/molchunktools/molchunk_tools/test/d3_chembl.smi'
smiles_column = 'f0'
canonical_id_column = 'f1'
activity_column = 'f2'
d3_df = ingest_chembl_smi(smi_path, smiles_column, canonical_id_column,
                          activity_column)

fingerprint_matrix_chembld3 = fingerprint_matrix_from_df(d3_df)

#define smiles and names for compounds in the matrix to be compared with
#this will be the key system for returning the nearest neighbor
smiles = list(d3_df['standard_smiles'])
name = list(d3_df['canonical_id'])

dataset_dir = '/cbica/home/grahamth/er_molchunk_dir'
dataset = ds.dataset(dataset_dir, format="feather")

output_dir = '/cbica/home/grahamth/d3fpsim'

# Create a list of fragments that are not memory loaded
fragments = [file for file in dataset.get_fragments()]

for count, element in enumerate(fragments):
    #cast the fragment as a pandas df
    df = element.to_table().to_pandas()
    #reset the index
    df = df.reset_index(drop=True)

    columns_to_keep = ['enumerated_smiles', 'CatalogID', 'ID_Index']
    df2 = df[columns_to_keep]
    df3 = df2.explode('enumerated_smiles')
from os import environ

import pandas as pd
import pyarrow as pa
from pyarrow import fs
import pyarrow.dataset as ds


s3 = fs.S3FileSystem(
    access_key=environ['B2_ACCESS_KEY_ID'],
    secret_key=environ['B2_SECRET_ACCESS_KEY'],
    endpoint_override=environ['B2_ENDPOINT_URL']
)

dataset = ds.dataset(
    source='polygon-equities/data/trades',
    format='feather',
    filesystem=s3,
    partitioning='hive',
    exclude_invalid_files=True
)

df = dataset.to_table(
    # columns=['symbol', 'sip_epoch', 'price', 'size'],
    filter=ds.field('date') == '2020-07-01'
).to_pandas()


# local
dataset = ds.dataset(
    source='/Users/bobcolner/QuantClarity/data/trades/feather/',
    format='feather',
    partitioning='hive',
)
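A small follow-on sketch (column names are assumptions): combining column projection with a partition-key predicate so only the needed columns and hive partitions are read.

table = dataset.to_table(
    columns=['symbol', 'price', 'size'],         # hypothetical columns
    filter=(ds.field('date') == '2020-07-01')    # hive partition key
)
df = table.to_pandas()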
Example 12
def test_dataset_from_a_list_of_local_directories_raises(multisourcefs):
    msg = 'points to a directory, but only file paths are supported'
    with pytest.raises(IsADirectoryError, match=msg):
        ds.dataset(['/plain', '/schema', '/hive'], filesystem=multisourcefs)
Example 13
def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
    child1 = ds.dataset('/plain', filesystem=multisourcefs, format='parquet')
    child2 = ds.dataset('/schema', filesystem=multisourcefs, format='parquet',
                        partitioning=['week', 'color'])
    child3 = ds.dataset('/hive', filesystem=multisourcefs, format='parquet',
                        partitioning='hive')

    assert child1.schema != child2.schema != child3.schema

    assembled = ds.dataset([child1, child2, child3])
    assert isinstance(assembled, ds.UnionDataset)

    msg = 'cannot pass any additional arguments'
    with pytest.raises(ValueError, match=msg):
        ds.dataset([child1, child2], filesystem=multisourcefs)

    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    assembled = ds.dataset([child1, child3])
    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    expected_schema = pa.schema([
        ('month', pa.int32()),
        ('color', pa.string()),
        ('date', pa.date32()),
    ])
    assembled = ds.dataset([child1, child3], schema=expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    expected_schema = pa.schema([
        ('month', pa.int32()),
        ('color', pa.string()),
        ('unkown', pa.string())  # fill with nulls
    ])
    assembled = ds.dataset([child1, child3], schema=expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    # incompatible schemas, date and index columns have conflicting types
    table = pa.table([range(9), [0.] * 4 + [1.] * 5, 'abcdefghj'],
                     names=['date', 'value', 'index'])
    _, path = _create_single_file(tempdir, table=table)
    child4 = ds.dataset(path)

    with pytest.raises(pa.ArrowInvalid, match='Unable to merge'):
        ds.dataset([child1, child4])
Example 14
def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server):
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    bucket = 'theirbucket'
    path = 'nested/folder/data.parquet'
    uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
        access_key, secret_key, bucket, path, host, port
    )

    fs, path = FileSystem.from_uri(uri)
    assert path == 'theirbucket/nested/folder/data.parquet'

    fs.create_dir(bucket)

    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream(path) as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing the filesystem as a URI
    template = (
        "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format(
            access_key, secret_key, host, port
        )
    )
    cases = [
        ('theirbucket/nested/folder/', '/data.parquet'),
        ('theirbucket/nested/folder', 'data.parquet'),
        ('theirbucket/nested/', 'folder/data.parquet'),
        ('theirbucket/nested', 'folder/data.parquet'),
        ('theirbucket', '/nested/folder/data.parquet'),
        ('theirbucket', 'nested/folder/data.parquet'),
    ]
    for prefix, path in cases:
        uri = template.format(prefix)
        dataset = ds.dataset(path, filesystem=uri, format="parquet")
        assert dataset.to_table().equals(table)

    with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'):
        uri = template.format('/')
        ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri)

    error = (
        "The path component of the filesystem URI must point to a directory "
        "but it has a type: `{}`. The path component is `{}` and the given "
        "filesystem URI is `{}`"
    )

    path = 'theirbucket/doesnt/exist'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('NotFound', path, uri)

    path = 'theirbucket/nested/folder/data.parquet'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('File', path, uri)
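A side sketch of the URI handling this test relies on (credentials, bucket and endpoint are placeholders): FileSystem.from_uri splits an s3:// URI into a filesystem object plus the path inside it, which is why ds.dataset accepts either a full URI or a path together with a filesystem URI above.

from pyarrow.fs import FileSystem

fs_obj, inner_path = FileSystem.from_uri(
    "s3://access:secret@mybucket/nested/folder/data.parquet"
    "?scheme=http&endpoint_override=localhost:9000"
)
# inner_path == 'mybucket/nested/folder/data.parquet'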
Example 15
    def get_table_df(self, cfg, **kwargs) -> pd.DataFrame:
        """Use Pandas instead of Spark to retrieve the data"""

        self.cfg = cfg

        table = kwargs.pop("table")
        start = kwargs.pop("start_time")
        end = kwargs.pop("end_time")
        view = kwargs.pop("view")
        fields = kwargs.pop("columns")
        addnl_filter = kwargs.pop("add_filter", None)
        key_fields = kwargs.pop("key_fields")
        merge_fields = kwargs.pop('merge_fields', {})

        folder = self._get_table_directory(table)

        if addnl_filter:
            # This is for special cases that are specific to an object
            query_str = addnl_filter
        else:
            query_str = None

        if query_str is None:
            # Make up a dummy query string to avoid if/then/else
            query_str = "timestamp != 0"

        # If sqvers is in the requested data, we have to handle it separately
        if 'sqvers' in fields:
            fields.remove('sqvers')
            need_sqvers = True
            max_vers = 0
        else:
            need_sqvers = False

        # If requesting a specific version of the data, handle that differently too
        sqvers = kwargs.pop('sqvers', None)
        try:
            dirs = Path(folder)
            datasets = []
            for elem in dirs.iterdir():
                # Additional processing around sqvers filtering and data
                if 'sqvers=' not in str(elem):
                    continue
                if sqvers and f'sqvers={sqvers}' != elem.name:
                    continue
                elif need_sqvers:
                    vers = float(str(elem).split('=')[-1])
                    if vers > max_vers:
                        max_vers = vers

                datasets.append(
                    ds.dataset(elem, format='parquet', partitioning='hive'))

            if not datasets:
                datasets = [
                    ds.dataset(folder, format='parquet', partitioning='hive')
                ]

            # Build the filters for predicate pushdown
            master_schema = self._build_master_schema(datasets)

            avail_fields = list(
                filter(lambda x: x in master_schema.names, fields))

            filters = self.build_ds_filters(start,
                                            end,
                                            master_schema,
                                            merge_fields=merge_fields,
                                            **kwargs)

            final_df = ds.dataset(datasets) \
                         .to_table(filter=filters, columns=avail_fields) \
                         .to_pandas(self_destruct=True) \
                         .query(query_str)

            if merge_fields:
                # These are key fields that need to be set right before we do
                # the drop duplicates to avoid missing out all the data
                for field in merge_fields:
                    newfld = merge_fields[field]
                    if (field in final_df.columns
                            and newfld in final_df.columns):
                        final_df[newfld] = np.where(final_df[newfld],
                                                    final_df[newfld],
                                                    final_df[field])
                    elif (field in final_df.columns
                          and newfld not in final_df.columns):
                        final_df.rename(columns={field: newfld}, inplace=True)

            if (not final_df.empty and (view == 'latest')
                    and all(x in final_df.columns for x in key_fields)):
                final_df = final_df.set_index(key_fields) \
                                   .sort_values(by='timestamp') \
                                   .query('~index.duplicated(keep="last")') \
                                   .reset_index()
        except (pa.lib.ArrowInvalid, OSError):
            return pd.DataFrame(columns=fields)

        fields = [x for x in final_df.columns if x in fields]
        if need_sqvers:
            final_df['sqvers'] = max_vers
            fields.insert(0, 'sqvers')

        return final_df[fields]
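A hedged sketch of the union step used in get_table_df (directory layout and column names are assumptions): ds.dataset() also accepts a list of already-constructed child datasets and returns a UnionDataset, so per-sqvers directories with slightly different schemas can be read through one merged schema.

import pyarrow.dataset as ds

child_a = ds.dataset('/data/table/sqvers=1.0', format='parquet',
                     partitioning='hive')
child_b = ds.dataset('/data/table/sqvers=2.0', format='parquet',
                     partitioning='hive')
union = ds.dataset([child_a, child_b])
table = union.to_table(filter=ds.field('timestamp') != 0,
                       columns=['namespace', 'timestamp'])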
Example 16
def write_files(table: str, filelist: List[str], in_basedir: str,
                outfolder: str, partition_cols: List[str],
                state: SqCoalesceState, block_start, block_end) -> None:
    """Write the data from the list of files out as a single coalesced block

    We're fixing the compression in this function
    :param table: str, Name of the table for which we're writing the files
    :param filelist: List[str], list of files to read the data from
    :param in_basedir: str, base directory of the read files,
                       to get partition date
    :param outfolder: str, the outgoing folder to write the data to
    :param partition_cols: List[str], partition columns
    :param state: SqCoalesceState, coalescer state, for constructing filename
    :param block_start: datetime, starting time window of this coalescing block
    :param block_end: datetime, ending time window of this coalescing block
    :returns: Nothing
    """
    if not filelist and not state.schema.type == "record":
        return

    state.block_start = int(block_start.timestamp())
    state.block_end = int(block_end.timestamp())
    if filelist:
        this_df = ds.dataset(source=filelist, partitioning='hive',
                             partition_base_dir=in_basedir) \
            .to_table() \
            .to_pandas()
        state.wrrec_count += this_df.shape[0]

        if not this_df.empty:
            this_df = migrate_df(table, this_df, state.schema)

        if state.schema.type == "record":
            if not state.current_df.empty:
                this_df = this_df.set_index(state.keys)
                sett = set(this_df.index)
                setc = set(state.current_df.index)
                missing_set = setc.difference(sett)
                if missing_set:
                    missing_df = state.current_df.loc[missing_set]
                    this_df = pd.concat([this_df.reset_index(),
                                         missing_df.reset_index()])
                else:
                    this_df = this_df.reset_index()
    elif not state.current_df.empty:
        assert(state.schema.type == "record")
        this_df = state.current_df.reset_index()
    else:
        return

    this_df.sqvers = state.schema.version  # Updating the schema version
    state.dbeng.write(state.table_name, "pandas", this_df, True,
                      state.schema.get_arrow_schema(),
                      state.pq_file_name)

    if state.schema.type == "record" and filelist:
        # Now replace the old dataframe with this new set for "record" types
        # Non-record types should never have current_df non-empty
        state.current_df = this_df.set_index(state.keys) \
                                  .sort_values(by='timestamp') \
                                  .query('~index.duplicated(keep="last")')
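A hedged mini-example of the read pattern at the top of write_files (paths are made up): passing an explicit file list together with partition_base_dir lets pyarrow recover the hive partition columns (sqvers=..., namespace=...) from each file's path even though no directory is being scanned.

import pyarrow.dataset as ds

files = ['/data/table/sqvers=1.0/namespace=dc1/part-0.parquet',
         '/data/table/sqvers=1.0/namespace=dc2/part-0.parquet']
df = ds.dataset(source=files, partitioning='hive',
                partition_base_dir='/data/table') \
    .to_table() \
    .to_pandas()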
Example 17
v.levels

v.expand(2)

x = np.random.choice(list("abdcde"), size=10000, replace=True)
v.to_index(pd.Series(x))
v.to_categorical(pd.Series(x))
v.to_sparse(pd.Series(x))



## cycler
import io
data = """
v1,v2,v3
1,2,3
4,5,6
7,8,9
"""

with open('test.csv', 'w') as fout:
    fout.write(data)

from pyarrow import csv
opts = csv.ConvertOptions()
csv.read_csv('test.csv', convert_options=opts)

import pyarrow.dataset as ds
for chunk in csv.open_csv('test.csv'):
    ds.dataset(chunk)
        filter_ = (ds.field("event") == 3749778)
    elif selectivity == "10":
        filter_ = (ds.field("total_amount") > 27)
    elif selectivity == "100":
        filter_ = None
    elif selectivity == "sm":
        filter_ = (ds.field("total_amount") > 300)
    elif selectivity == "smm":
        filter_ = (ds.field("total_amount") > 500)

    results = list()
    for i in range(iterations):
        e = os.system('./clean_cache.sh')
        if e != 0:
            print('failed to clean cache')
        dataset_ = ds.dataset(directory, format=format_)
        cols_ = dataset_.schema.names
        start = time.time()
        j = 0

        futures_list = list()
        for file in dataset_.files:
            future = client.submit(do_scan, file, cols_)
            futures_list.append(future)

        wait(futures_list)

        end = time.time()
        results.append(end - start)

    print(f"{fmt}_{selectivity} = ", results)
Example 19
def test_open_dataset_from_source_additional_kwargs(multisourcefs):
    child = ds.FileSystemDatasetFactory(multisourcefs,
                                        fs.FileSelector('/plain'),
                                        format=ds.ParquetFileFormat())
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(child, format="parquet")
Example 20
    def read(self, table_name: str, data_format: str,
             **kwargs) -> pd.DataFrame:
        """Read the data specified from parquet files and return

        This function also implements predicate pushdown to filter the data
        as specified by the provided filters.

        :param table_name: str, the name of the table to be read
        :param data_format: str, format the data is to be returned in
                            (only pandas supported at this point)
        :param columns: List[str], list of columns requested to be read,
                        only those specified are returned, keyword arg
        :param key_fields: List[str], key fields for table, required to
                           deduplicate, keyword arg only
        :param view: str, one of ["latest", "all"], keyword arg only
        :param start: float, starting time window for data, timestamp,
                      can be 0 to indicate latest, keyword arg only
        :param end: float, ending time window for data, timestamp,
                    can be 0 to indicate latest, keyword arg only,
        :param kwargs: dict, the optional keyword arguments, addnl_filter,
                       and merge_fields, not needed typically
        :returns: pandas dataframe of the data specified, or None if
                  unsupported format
        :rtype: pd.DataFrame

        """

        if data_format not in self.supported_data_formats():
            return None

        start = kwargs.pop("start_time")
        end = kwargs.pop("end_time")
        view = kwargs.pop("view")
        fields = kwargs.pop("columns")
        key_fields = kwargs.pop("key_fields")
        addnl_filter = kwargs.pop("add_filter", None)
        merge_fields = kwargs.pop('merge_fields', {})

        folder = self._get_table_directory(table_name, False)

        if addnl_filter:
            # This is for special cases that are specific to an object
            query_str = addnl_filter
        else:
            query_str = None

        if query_str is None:
            # Make up a dummy query string to avoid if/then/else
            query_str = "timestamp != 0"

        # If sqvers is in the requested data, we have to handle it separately
        if 'sqvers' in fields:
            fields.remove('sqvers')
            need_sqvers = True
            max_vers = 0
        else:
            need_sqvers = False

        # If requesting a specific version of the data, handle that differently too
        sqvers = kwargs.pop('sqvers', None)
        datasets = []
        try:
            dirs = Path(folder)
            try:
                for elem in dirs.iterdir():
                    # Additional processing around sqvers filtering and data
                    if 'sqvers=' not in str(elem):
                        continue
                    if sqvers and f'sqvers={sqvers}' != elem.name:
                        continue
                    elif need_sqvers:
                        vers = float(str(elem).split('=')[-1])
                        if vers > max_vers:
                            max_vers = vers

                    datasets.append(
                        ds.dataset(elem, format='parquet',
                                   partitioning='hive'))
            except FileNotFoundError:
                pass
            except Exception as e:
                raise e

            # Now find the exact set of files we need to go over
            cp_dataset = self._get_cp_dataset(table_name, need_sqvers, sqvers,
                                              view, start, end)
            if cp_dataset:
                datasets.append(cp_dataset)

            if not datasets:
                datasets = [
                    ds.dataset(folder, format='parquet', partitioning='hive')
                ]

            # Build the filters for predicate pushdown
            master_schema = self._build_master_schema(datasets)

            avail_fields = list(
                filter(lambda x: x in master_schema.names, fields))

            filters = self.build_ds_filters(start,
                                            end,
                                            master_schema,
                                            merge_fields=merge_fields,
                                            **kwargs)

            final_df = ds.dataset(datasets) \
                .to_table(filter=filters, columns=avail_fields) \
                .to_pandas(self_destruct=True) \
                .query(query_str) \
                .sort_values(by='timestamp')

            if merge_fields:
                # These are key fields that need to be set right before we do
                # the drop duplicates to avoid missing out all the data
                for field in merge_fields:
                    newfld = merge_fields[field]
                    if (field in final_df.columns
                            and newfld in final_df.columns):
                        final_df[newfld] = np.where(final_df[newfld],
                                                    final_df[newfld],
                                                    final_df[field])
                    elif (field in final_df.columns
                          and newfld not in final_df.columns):
                        final_df = final_df.rename(columns={field: newfld})

            # Because of how coalescing works, we can have multiple duplicated
            # entries with same timestamp. Remove them
            dupts_keys = key_fields + ['timestamp']
            final_df = final_df.set_index(dupts_keys) \
                               .query('~index.duplicated(keep="last")') \
                               .reset_index()
            if (not final_df.empty and (view == 'latest')
                    and all(x in final_df.columns for x in key_fields)):
                final_df = final_df.set_index(key_fields) \
                                   .query('~index.duplicated(keep="last")')
        except (pa.lib.ArrowInvalid, OSError):
            return pd.DataFrame(columns=fields)

        if need_sqvers:
            final_df['sqvers'] = max_vers
            fields.insert(0, 'sqvers')

        cols = set(final_df.columns.tolist() + final_df.index.names)
        fields = [x for x in fields if x in cols]
        return final_df.reset_index()[fields]
Example 21
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # a list of row groups per source should be passed. make the list of
    # lists that is expected for multiple sources
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs)
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported")
        filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FileSystemDataset
        dataset = ds.dataset(filepaths_or_buffers,
                             format="parquet",
                             partitioning="hive")

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.get_row_group_fragments(filters):
                for rg_id in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_id)

        # TODO: Use this with pyarrow 1.0.0
        # # Load IDs of filtered row groups for each file in dataset
        # filtered_row_group_ids = {}
        # for fragment in dataset.get_fragments(filters):
        #     for row_group_fragment in fragment.split_by_row_group(filters):
        #         for row_group_info in row_group_fragment.row_groups:
        #             path = row_group_fragment.path
        #             if path not in filtered_row_group_ids:
        #                 filtered_row_group_ids[path] = [row_group_info.id]
        #             else:
        #                 filtered_row_group_ids[path].append(row_group_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                # Materialize the intersection eagerly; a lazy filter() would
                # capture row_groups[i] after it has been reassigned.
                row_groups[i] = [
                    rg_id for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs))
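A hedged sketch of the newer fragment API referenced in the TODO above (the directory path and filter are assumptions): a filter expression prunes whole row groups per file before any column data is read. filters_to_expression is the public spelling of the helper in recent pyarrow releases.

import pyarrow.dataset as ds
import pyarrow.parquet as pq

expr = pq.filters_to_expression([('year', '=', 2020)])
dataset = ds.dataset('/data/parquet_dir', format='parquet',
                     partitioning='hive')

row_group_ids = {}
for fragment in dataset.get_fragments(filter=expr):
    for rg_fragment in fragment.split_by_row_group(expr):
        ids = [rg.id for rg in rg_fragment.row_groups]
        row_group_ids.setdefault(rg_fragment.path, []).extend(ids)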
Example 22
    def _get_cp_dataset(self, table_name: str, need_sqvers: bool, sqvers: str,
                        view: str, start_time: float,
                        end_time: float) -> ds.dataset:
        """Get the list of files to read in coalesced dir

        This iterates over the coalesced files that need to be read and comes
        up with a list of files that corresponds to the timeslot the user has
        specified

        :param table_name: str, Table for which coalesced info is requested
        :param need_sqvers: bool, True if the user has requested that we
                            return the sqvers
        :param sqvers: str, if we're looking only for files of a specific version
        :param view: str, whether to return the latest only OR all
        :param start_time: float, the starting time window of data needed
        :param end_time: float, the ending time window of data needed
        :returns: pyarrow dataset for the files to be read
        :rtype: pyarrow.dataset.dataset

        """

        filelist = []
        max_vers = 0

        folder = self._get_table_directory(table_name, True)

        if start_time and end_time or (view == "all"):
            # Enforcing the logic we have: if both start_time & end_time
            # are given, return all files since the model is that the user is
            # expecting to see all changes in the time window. Otherwise, the user
            # is expecting to see only the latest before an end_time OR after a
            # start_time.
            all_files = True
        else:
            all_files = False

        # We need to iterate otherwise the differing schema from different dirs
        # causes the read to abort.
        dirs = Path(folder)
        if not dirs.exists() or not dirs.is_dir():
            return

        for elem in dirs.iterdir():
            # Additional processing around sqvers filtering and data
            if 'sqvers=' not in str(elem):
                continue
            if sqvers and f'sqvers={sqvers}' != elem.name:
                continue
            elif need_sqvers:
                vers = float(str(elem).split('=')[-1])
                if vers > max_vers:
                    max_vers = vers

            dataset = ds.dataset(elem, format='parquet', partitioning='hive')
            if not start_time and not end_time:
                files = dataset.files
            else:
                files = []
                latest_filedict = {}
                prev_time = 0
                prev_namespace = ''
                file_in_this_ns = False
                prev_file = None
                thistime = []
                for file in sorted(dataset.files):
                    namespace = os.path.dirname(file).split('namespace=')[1] \
                        .split('/')[0]
                    if (prev_namespace and (namespace != prev_namespace)
                            and thistime and not file_in_this_ns):
                        if ((start_time and thistime[1] >= start_time)
                                or (end_time and thistime[1] >= end_time)):
                            files.append(prev_file)
                            prev_namespace = ''
                    thistime = os.path.basename(file).split('.')[0] \
                        .split('-')[-2:]
                    thistime = [int(x) * 1000 for x in thistime]  # time in ms
                    if not start_time or (thistime[0] >= start_time):
                        if not end_time:
                            files.append(file)
                            file_in_this_ns = True
                        elif thistime[0] < end_time:
                            files.append(file)
                            file_in_this_ns = True
                        elif prev_time < end_time < thistime[0]:
                            key = file.split('namespace=')[1].split('/')[0]
                            if key not in latest_filedict:
                                latest_filedict[key] = file
                                file_in_this_ns = True

                    prev_time = thistime[0]
                    prev_file = file
                    prev_namespace = namespace
                if not file_in_this_ns:
                    if (thistime
                            and ((start_time and thistime[1] >= start_time) or
                                 (end_time and thistime[1] >= end_time))):
                        files.append(file)

                if latest_filedict:
                    filelist.extend(list(latest_filedict.values()))
            if not all_files and files:
                latest_filedict = {
                    x.split('namespace=')[1].split('/')[0]: x
                    for x in sorted(files)
                }
                filelist.extend(list(latest_filedict.values()))
            elif files:
                filelist.extend(sorted(files))

        if filelist:
            return ds.dataset(filelist, format='parquet', partitioning='hive')
        else:
            return []
Example 23
def test_open_dataset_from_source_additional_kwargs(tempdir):
    _, path = _create_single_file(tempdir)
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(ds.source(path), format="parquet")
Example 24
 def __init__(self, path, process_func):
     super().__init__()
     self.dataset = ds.dataset(path)
     self.process_func = process_func
Example 25
def _read_map_parquet(healsparse_class,
                      filepath,
                      pixels=None,
                      header=False,
                      degrade_nside=None,
                      weightfile=None,
                      reduction='mean',
                      use_threads=False):
    """
    Internal function to read in a HealSparseMap from a parquet dataset.

    Parameters
    ----------
    healsparse_class : `type`
        Type value of the HealSparseMap class.
    filepath : `str`
        Name of the file path to read.  Must be a parquet dataset.
    pixels : `list`, optional
        List of coverage map pixels to read.
    header : `bool`, optional
        Return the parquet metadata as well as map?  Default is False.
    degrade_nside : `int`, optional
        Degrade map to this nside on read.  None means leave as-is.
        Not yet implemented for parquet.
    weightfile : `str`, optional
        Floating-point map to supply weights for degrade wmean.  Must
        be a HealSparseMap (weighted degrade not supported for
        healpix degrade-on-read).
        Not yet implemented for parquet.
    reduction : `str`, optional
        Reduction method with degrade-on-read.
        (mean, median, std, max, min, and, or, sum, prod, wmean).
        Not yet implemented for parquet.
    use_threads : `bool`, optional
        Use multithreaded reading.

    Returns
    -------
    healSparseMap : `HealSparseMap`
        HealSparseMap from file, covered by pixels
    header : `astropy.io.fits.Header` (if header=True)
        Header metadata for the map file.
    """
    ds = dataset.dataset(filepath, format='parquet', partitioning='hive')
    schema = ds.schema
    # Convert from byte strings
    md = {
        key.decode(): schema.metadata[key].decode()
        for key in schema.metadata
    }

    if 'healsparse::filetype' not in md:
        raise RuntimeError("Filepath %s is not a healsparse parquet map." %
                           (filepath))
    if md['healsparse::filetype'] != 'healsparse':
        raise RuntimeError("Filepath %s is not a healsparse parquet map." %
                           (filepath))
    cov_fname = os.path.join(filepath, '_coverage.parquet')
    if not os.path.isfile(cov_fname):
        # Note that this could be reconstructed from the information in the file
        # inefficiently.  This feature could be added in the future.
        raise RuntimeError("Filepath %s is missing coverage map %s" %
                           (filepath, cov_fname))

    nside_sparse = int(md['healsparse::nside_sparse'])
    nside_coverage = int(md['healsparse::nside_coverage'])
    nside_io = int(md['healsparse::nside_io'])
    bitshift_io = _compute_bitshift(nside_io, nside_coverage)

    cov_tab = parquet.read_table(cov_fname, use_threads=use_threads)
    cov_pixels = cov_tab['cov_pix'].to_numpy()
    row_groups = cov_tab['row_group'].to_numpy()

    if pixels is not None:
        _pixels = np.atleast_1d(pixels)
        if len(np.unique(_pixels)) < len(_pixels):
            raise RuntimeError("Input list of pixels must be unique.")

        sub = np.clip(np.searchsorted(cov_pixels, _pixels), 0,
                      cov_pixels.size - 1)
        ok, = np.where(cov_pixels[sub] == _pixels)
        if ok.size == 0:
            raise RuntimeError(
                "None of the specified pixels are in the coverage map.")
        _pixels = np.sort(_pixels[ok])

        _pixels_io = np.right_shift(_pixels, bitshift_io)

        # Figure out row groups...
        matches = np.searchsorted(cov_pixels, _pixels)
        _row_groups_io = row_groups[matches]
    else:
        _pixels = cov_pixels
        _pixels_io = None
        _row_groups_io = None

    cov_map = HealSparseCoverage.make_from_pixels(nside_coverage, nside_sparse,
                                                  _pixels)

    if md['healsparse::widemask'] == 'True':
        is_wide_mask = True
        wmult = int(md['healsparse::wwidth'])
    else:
        is_wide_mask = False
        wmult = 1

    if md['healsparse::primary'] != '':
        # This is a multi-column table.
        is_rec_array = True
        primary = md['healsparse::primary']
        columns = [
            name for name in schema.names if name not in ['iopix', 'cov_pix']
        ]
        dtype = [(name, schema.field(name).type.to_pandas_dtype())
                 for name in columns]
        primary_dtype = schema.field(primary).type.to_pandas_dtype()
    else:
        is_rec_array = False
        primary = None
        dtype = schema.field('sparse').type.to_pandas_dtype()
        primary_dtype = dtype
        columns = ['sparse']

    if md['healsparse::sentinel'] == 'UNSEEN':
        sentinel = primary_dtype(hpg.UNSEEN)
    elif md['healsparse::sentinel'] == 'False':
        sentinel = False
    elif md['healsparse::sentinel'] == 'True':
        sentinel = True
    else:
        sentinel = primary_dtype(md['healsparse::sentinel'])

        if is_integer_value(sentinel):
            sentinel = int(sentinel)
        elif not isinstance(sentinel, np.bool_):
            sentinel = float(sentinel)

    if is_rec_array:
        sparse_map = np.zeros((_pixels.size + 1) * cov_map.nfine_per_cov,
                              dtype=dtype)
        # Fill in the overflow (primary)
        sparse_map[primary][:cov_map.nfine_per_cov] = sentinel
        # Fill in the overflow (not primary)
        for d in dtype:
            if d[0] == primary:
                continue
            sparse_map[d[0]][:cov_map.nfine_per_cov] = check_sentinel(
                d[1], None)
    else:
        sparse_map = np.zeros(
            (_pixels.size + 1) * cov_map.nfine_per_cov * wmult, dtype=dtype)
        sparse_map[:cov_map.nfine_per_cov * wmult] = sentinel

    if _pixels_io is None:
        # Read the full table
        tab = ds.to_table(columns=columns, use_threads=use_threads)
    else:
        _pixels_io_unique = list(np.unique(_pixels_io))

        fragments = list(
            ds.get_fragments(
                filter=dataset.field('iopix').isin(_pixels_io_unique)))
        group_fragments = []
        for pixel_io, fragment in zip(_pixels_io_unique, fragments):
            groups = fragment.split_by_row_group()
            # Only append groups that are relevant
            use, = np.where(_pixels_io == pixel_io)
            for ind in use:
                group_fragments.append(groups[_row_groups_io[ind]])

        ds2 = dataset.FileSystemDataset(group_fragments, schema, ds.format)
        tab = ds2.to_table(columns=columns, use_threads=use_threads)

    if is_rec_array:
        for name in columns:
            sparse_map[name][cov_map.nfine_per_cov:] = tab[name].to_numpy()
    else:
        sparse_map[cov_map.nfine_per_cov * wmult:] = tab['sparse'].to_numpy()

        if is_wide_mask:
            sparse_map = sparse_map.reshape(
                (sparse_map.size // wmult, wmult)).astype(WIDE_MASK)

    healsparse_map = healsparse_class(cov_map=cov_map,
                                      sparse_map=sparse_map,
                                      nside_sparse=nside_sparse,
                                      primary=primary,
                                      sentinel=sentinel)

    if header:
        if 'healsparse::header' in md:
            hdr_string = md['healsparse::header']
            hdr = fits.Header.fromstring(hdr_string)
        else:
            hdr = fits.Header()

        return (healsparse_map, hdr)
    else:
        return healsparse_map
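A hedged mini-example of the fragment reassembly used above for partial reads (the path, field name and pixel values are assumptions): row-group fragments selected by a filter can be wrapped back into a FileSystemDataset and read as a single table.

from pyarrow import dataset

full_ds = dataset.dataset('/path/to/healsparse_map', format='parquet',
                          partitioning='hive')
fragments = list(full_ds.get_fragments(
    filter=dataset.field('iopix').isin([10, 11])))
row_group_fragments = [grp for frag in fragments
                       for grp in frag.split_by_row_group()]
sub_ds = dataset.FileSystemDataset(row_group_fragments, full_ds.schema,
                                   full_ds.format)
tab = sub_ds.to_table(columns=['sparse'], use_threads=False)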
Example 26
def test_open_dataset_non_existing_file():
    # ARROW-8213: Opening a dataset with a local incorrect path gives confusing
    #             error message
    with pytest.raises(FileNotFoundError):
        ds.dataset('i-am-not-existing.parquet', format='parquet')
Example 27
 def update_scdata(self, scdata):  # TODO: Only accepts one file for now
     ds.write_dataset(scdata, self.data_interim, format="arrow")
     dataset = ds.dataset(self.data_interim, format="arrow")
     self.filetype = "arrow"
     self.memory_mapped_dataset = pa.memory_map(dataset.files[0], 'r')
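A hedged round-trip sketch of the pattern above (the path and table contents are assumptions): write a table out as an Arrow IPC ("arrow"/feather) dataset, re-open it, and memory-map the resulting file.

import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({'gene': ['a', 'b'], 'count': [1, 2]})
ds.write_dataset(table, '/tmp/interim', format='arrow')
reopened = ds.dataset('/tmp/interim', format='arrow')
mmap = pa.memory_map(reopened.files[0], 'r')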
Example 28
import pandas as pd
import pyarrow.dataset as ds
import pyarrow.feather as fe
import datamol as dm
import operator

dm.disable_rdkit_log()

dataset = [
    '/data/dockop_glide_d3/second50k_glide_molchunkout/second50k_glide_out.molchunk',
    '/data/dockop_glide_d3/first50k_glide_molchunkout',
    '/data/dockop_glide_d3/thirdd50k_glide_molchunkout/third50k_glide_out.molchunk',
    '/data/dockop_glide_d3/fourth50k_glide_molchunkout/fourth50k_glide_out.molchunk',
    '/data/dockop_glide_d3/fithround_glide_molchunkout/fifth50k_glide_out.molchunk'
]
dflist = []
for data in dataset:
    # Avoid rebinding the name of the list being iterated over
    molchunk_ds = ds.dataset(data, format="feather")
    df = molchunk_ds.to_table().to_pandas()
    dflist.append(df)


def combine_unique_molchunks_with_identical_columns(molchunk_1, molchunk_2):
    outer_merged = pd.merge(molchunk_1, molchunk_2, how='outer')
    return outer_merged


docked_df = combine_unique_molchunks_with_identical_columns(
    dflist[0], dflist[1])
docked_df = combine_unique_molchunks_with_identical_columns(
    docked_df, dflist[2])
docked_df = combine_unique_molchunks_with_identical_columns(
    docked_df, dflist[3])
Example 29
    def from_parquet(
        cls,
        file,
        treepath="/Events",
        entry_start=None,
        entry_stop=None,
        runtime_cache=None,
        persistent_cache=None,
        schemaclass=NanoAODSchema,
        metadata=None,
        parquet_options={},
        rados_parquet_options={},
        access_log=None,
    ):
        """Quickly build NanoEvents from a parquet file

        Parameters
        ----------
            file : str, pathlib.Path, pyarrow.NativeFile, or python file-like
                The filename or already opened file using e.g. ``uproot.open()``
            treepath : str, optional
                Name of the tree to read in the file
            entry_start : int, optional
                Start at this entry offset in the tree (default 0)
            entry_stop : int, optional
                Stop at this entry offset in the tree (default end of tree)
            runtime_cache : dict, optional
                A dict-like interface to a cache object. This cache is expected to last the
                duration of the program only, and will be used to hold references to materialized
                awkward arrays, etc.
            persistent_cache : dict, optional
                A dict-like interface to a cache object. Only bare numpy arrays will be placed in this cache,
                using globally-unique keys.
            schemaclass : BaseSchema
                A schema class deriving from `BaseSchema` and implementing the desired view of the file
            metadata : dict, optional
                Arbitrary metadata to add to the `base.NanoEvents` object
            parquet_options : dict, optional
                Any options to pass to ``pyarrow.parquet.ParquetFile``
            access_log : list, optional
                Pass a list instance to record which branches were lazily accessed by this instance
        """
        import pyarrow
        import pyarrow.dataset as ds
        import pyarrow.parquet

        ftypes = (
            pathlib.Path,
            pyarrow.NativeFile,
            io.TextIOBase,
            io.BufferedIOBase,
            io.RawIOBase,
            io.IOBase,
        )

        if isinstance(file, ftypes):
            table_file = pyarrow.parquet.ParquetFile(file, **parquet_options)
        elif isinstance(file, str):
            fs_file = fsspec.open(file, "rb")
            table_file = pyarrow.parquet.ParquetFile(fs_file,
                                                     **parquet_options)
        elif isinstance(file, pyarrow.parquet.ParquetFile):
            table_file = file
        else:
            raise TypeError("Invalid file type (%s)" % (str(type(file))))

        if entry_start is None or entry_start < 0:
            entry_start = 0
        if entry_stop is None or entry_stop > table_file.metadata.num_rows:
            entry_stop = table_file.metadata.num_rows

        pqmeta = table_file.schema_arrow.metadata
        pquuid = None if pqmeta is None else pqmeta.get(b"uuid", None)
        pqobj_path = None if pqmeta is None else pqmeta.get(
            b"object_path", None)

        partition_key = (
            str(None) if pquuid is None else pquuid.decode("ascii"),
            str(None) if pqobj_path is None else pqobj_path.decode("ascii"),
            "{0}-{1}".format(entry_start, entry_stop),
        )
        uuidpfn = {partition_key[0]: pqobj_path}
        mapping = ParquetSourceMapping(TrivialParquetOpener(
            uuidpfn, parquet_options),
                                       access_log=access_log)

        format_ = "parquet"
        if "ceph_config_path" in rados_parquet_options:
            format_ = ds.RadosParquetFileFormat(
                rados_parquet_options["ceph_config_path"].encode())

        dataset = ds.dataset(file,
                             schema=table_file.schema_arrow,
                             format=format_)

        shim = TrivialParquetOpener.UprootLikeShim(file, dataset)
        mapping.preload_column_source(partition_key[0], partition_key[1], shim)

        base_form = mapping._extract_base_form(table_file.schema_arrow)

        return cls._from_mapping(
            mapping,
            partition_key,
            base_form,
            runtime_cache,
            persistent_cache,
            schemaclass,
            metadata,
        )
Example 30
def test_open_union_dataset(tempdir):
    _, path = _create_single_file(tempdir)
    dataset = ds.dataset(path)

    union = ds.dataset([dataset, dataset])
    assert isinstance(union, ds.UnionDataset)