Example #1
def apply(path, parameters=None):
    """
    Import a Parquet file

    Parameters
    -------------
    path
        Path of the file to import
    parameters
        Parameters of the algorithm, possible values:
            Parameters.COLUMNS -> columns to import from the Parquet file

    Returns
    -------------
    df
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}

    columns = exec_utils.get_param_value(Parameters.COLUMNS, parameters, None)

    if columns:
        df = pq.read_pandas(path, columns=columns).to_pandas()
    else:
        df = pq.read_pandas(path).to_pandas()

    return dataframe_utils.legacy_parquet_support(df)
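A minimal call sketch for the importer above, assuming `apply` and the `Parameters` enum are in scope as in the snippet; the file path and column names are placeholders, not taken from the source:

# Hedged usage sketch: path and column names are placeholders.
df = apply(
    "event_log.parquet",
    parameters={Parameters.COLUMNS: ["col_a", "col_b"]},  # columns present in the file
)
print(df.head())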
Example #2
def apply(path, parameters=None):
    """
    Import a Parquet file

    Parameters
    -------------
    path
        Path of the file to import
    parameters
        Parameters of the algorithm, possible values:
            columns -> columns to import from the Parquet file

    Returns
    -------------
    df
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}

    columns = parameters[COLUMNS] if COLUMNS in parameters else None

    if columns:
        columns = [x.replace(":", "AAA") for x in columns]
        df = pq.read_pandas(path, columns=columns).to_pandas()
    else:
        df = pq.read_pandas(path, columns=columns).to_pandas()
    df.columns = [x.replace("AAA", ":") for x in df.columns]

    return df
    def __getitem__(self, index):
        if isinstance(index, int):
            col_to_load = str(self.metadata['signal_id'].loc[index])
            signal = pq.read_pandas(self.filename, columns=[col_to_load]).to_pandas()
            signal = np.array(signal).reshape(-1).astype(np.float32)
        else:
            if isinstance(index, slice):
                start = index.start if index.start is not None else 0
                stop = index.stop if index.stop is not None else len(self)
                print(stop)
                step = index.step if index.step is not None else 1
                if (start < 0) or (stop < 0) or (step < 0):
                    raise ValueError('start, stop and step must be non-negative')
                indices = list(range(start, stop, step))
            elif isinstance(index, list):
                indices = index
            elif isinstance(index, np.ndarray):
                indices = index.tolist()
            else:
                raise ValueError('index must be int, slice, list or np.ndarray')
            col_to_load = self.metadata['signal_id'].loc[indices]
            col_to_load = list(map(str, list(col_to_load)))
            #print(col_to_load)
            signal = pq.read_pandas(self.filename, columns=col_to_load).to_pandas().astype(np.float32)
            signal = signal.values.T

            
        return signal
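The `__getitem__` above accepts an int, a slice, a list, or an ndarray; a sketch of how such a dataset object might be indexed (the class name `SignalDataset` and its constructor arguments are assumptions, not shown in the snippet):

# Hypothetical construction; the real class name and __init__ are not shown above.
dataset = SignalDataset(filename="train.parquet", metadata=meta_train)

single = dataset[0]           # one column -> 1-D float32 array
window = dataset[0:9:3]       # slice -> array of shape (n_signals, n_samples)
chosen = dataset[[1, 4, 7]]   # explicit list of metadata row labels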
Example #4
def _download_data(key_prefix, s3, bucket_name, prefix, sep, skip_empty_files=True,
                   first_row_columns=True):
    if first_row_columns:
        header_setting = 'infer'
    else:
        header_setting = None
    df_list = []
    if prefix is False:
        file = s3.Object(bucket_name, key_prefix)
        try:
            data = StringIO(str(file.get()['Body'].read(), 'utf-8'))
        except UnicodeDecodeError:
            data = BytesIO(file.get()['Body'].read())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])
        try:
            df_list.append(pd.read_csv(data, error_bad_lines=False, warn_bad_lines=False, sep=sep,
                                       header=header_setting))
        except UnicodeDecodeError:
            df_list.append(pq.read_pandas(data).to_pandas())
        except EmptyDataError:
            print('File is empty')
            return pd.DataFrame([])

    else:
        bucket = s3.Bucket(bucket_name)

        try:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    tmp = StringIO(str(file.get()['Body'].read(), 'utf-8'))
                    try:
                        data = pd.read_csv(tmp, error_bad_lines=False, warn_bad_lines=False, sep=sep,
                                           header=header_setting)
                        df_list.append(data)
                    except EmptyDataError:
                        if skip_empty_files is False:
                            print('Encountered an empty file: ', file.key)
                            return pd.DataFrame([])

        except UnicodeDecodeError:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    data = BytesIO(file.get()['Body'].read())
                    df_list.append(pq.read_pandas(data).to_pandas())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])

    if not df_list:
        print('No matching file found')
        return pd.DataFrame([])
    data = pd.concat(df_list)
    return data
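A call sketch for the downloader above, assuming a boto3 S3 resource; the bucket name and key prefix are placeholders:

import boto3

# Hedged sketch: bucket and prefix are placeholders.
s3 = boto3.resource("s3")
df = _download_data("exports/2021/01/part-", s3, "my-bucket",
                    prefix=True, sep=",", skip_empty_files=True,
                    first_row_columns=True)
print(len(df))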
Example #5
def import_parquet_file(path, parameters=None):
    """
    Import a Parquet file

    Parameters
    -------------
    path
        Path of the file to import
    parameters
        Parameters of the algorithm, possible values:
            columns -> columns to import from the Parquet file

    Returns
    -------------
    df
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}

    columns = parameters[COLUMNS] if COLUMNS in parameters else None
    case_id_glue = parameters.get(
        pm4py_constants.PARAMETER_CONSTANT_CASEID_KEY, "case:concept:name")
    timestamp_key = parameters.get(
        pm4py_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, "time:timestamp")

    if path.startswith("s3dir:///"):
        path = "/" + path.split("s3dir:///")[1]
        all_files = get_list_parquets_from_s3(path)
        dataframes = []
        for f in all_files:
            dataframes.append(import_parquet_file(f))
        df = pd.concat(dataframes)
        df[timestamp_key] = pd.to_datetime(df[timestamp_key], utc=True)
        df = df.sort_values([case_id_glue, timestamp_key])
        return df
    elif path.startswith("s3:///"):
        path = get_parquet_from_s3(path)

    if columns:
        if enable_col_replacement:
            columns = [x.replace(":", "AAA") for x in columns]
        df = pq.read_pandas(path, columns=columns).to_pandas()
    else:
        df = pq.read_pandas(path, columns=columns).to_pandas()
    if enable_col_replacement:
        df.columns = [x.replace("AAA", ":") for x in df.columns]

    return df
Example #6
    def __init__(self,
                 root,
                 transform=None,
                 target_transform=None,
                 _type='test'):
        if _type == 'train':
            # 25148 segmentation fault (core dumped)
            # conda install pyarrow rather than pip.
            train = pd.read_csv(os.path.join(root, 'train.csv'))
            data0 = pq.read_pandas(
                os.path.join(root, 'train_image_data_0.parquet')).to_pandas()
            data1 = pq.read_pandas(
                os.path.join(root, 'train_image_data_1.parquet')).to_pandas()
            data2 = pq.read_pandas(
                os.path.join(root, 'train_image_data_2.parquet')).to_pandas()
            data3 = pq.read_pandas(
                os.path.join(root, 'train_image_data_3.parquet')).to_pandas()
            data_full = pd.concat([data0, data1, data2, data3],
                                  ignore_index=True)
        else:
            test = pd.read_csv(os.path.join(root, 'test.csv'))
            data0 = pq.read_pandas(
                os.path.join(root, 'test_image_data_0.parquet')).to_pandas()
            data1 = pq.read_pandas(
                os.path.join(root, 'test_image_data_1.parquet')).to_pandas()
            data2 = pq.read_pandas(
                os.path.join(root, 'test_image_data_2.parquet')).to_pandas()
            data3 = pq.read_pandas(
                os.path.join(root, 'test_image_data_3.parquet')).to_pandas()
            data_full = pd.concat([data0, data1, data2, data3],
                                  ignore_index=True)

        ipdb.set_trace()
        self.df = None
        self.label = None
Example #7
def prep_data(start, end):
    praq_train = pq.read_pandas(data_dir + 'train.parquet',
                                columns=[str(i) for i in range(start, end)
                                         ]).to_pandas()

    X = []
    y = []

    for id_measurement in tqdm(
            df_train.index.levels[0].unique()[int(start / 3):int(end / 3)]):
        x_signal = []
        # for each phase of the signal
        for phase in [0, 1, 2]:

            # extract from df_train both signal_id and target to compose the new data sets
            signal_id, target = df_train.loc[id_measurement].loc[phase]

            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)
            x_signal.append(transform_ts(praq_train[str(signal_id)]))

        # concatenate all the 3 phases in one matrix
        x_signal = np.concatenate(x_signal, axis=1)
        X.append(x_signal)

    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
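The inner loop relies on the convention that each `id_measurement` owns three consecutive `signal_id` columns, one per phase, which is why `start` and `end` are divided by 3; a tiny sketch of that mapping with made-up ids:

# signal_id -> (id_measurement, phase) under the three-phase layout assumed above.
for signal_id in range(6):
    id_measurement, phase = divmod(signal_id, 3)
    print(signal_id, "->", id_measurement, phase)
# 0 -> 0 0, 1 -> 0 1, 2 -> 0 2, 3 -> 1 0, 4 -> 1 1, 5 -> 1 2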
Example #8
def transform_test():

    # First we declare a series of parameters to set up loading of the main data.
    # It is too large to load at once, so we read it in 10 parts.
    first_sig = meta_test.index[0]
    n_parts = 10
    max_line = len(meta_test)
    part_size = int(max_line / n_parts)
    last_part = max_line % n_parts
    print(first_sig, n_parts, max_line, part_size, last_part, n_parts * part_size + last_part)
    # Here we create a list of lists with start index and end index for each of the 10 parts and one for the last partial part
    start_end = [[x, x+part_size] for x in range(first_sig, max_line + first_sig, part_size)]
    start_end = start_end[:-1] + [[start_end[-1][0], start_end[-1][0] + last_part]]
    print(start_end)

    X_test = []
    # Now, much like we did above with the train data, we convert the test data part by part,
    # transforming each measurement's 3 phases of 800000 samples into a (160, 57) matrix.
    for start, end in start_end:
        subset_test = pq.read_pandas('../input/test.parquet', columns=[str(i) for i in range(start, end)]).to_pandas()
        for i in tqdm(subset_test.columns):
            id_measurement, phase = meta_test.loc[int(i)]
            subset_test_col = subset_test[i]
            subset_trans = transform_ts(subset_test_col)
            X_test.append([i, id_measurement, phase, subset_trans])

    X_test_input = np.asarray([np.concatenate([X_test[i][3],X_test[i+1][3], X_test[i+2][3]], axis=1) for i in range(0,len(X_test), 3)])
    np.save("../input/X_test.npy",X_test_input)

    return X_test_input
def get_measurement(ID):
    columns = [str(i) for i in range(ID * 3, ID * 3 + 3, 1)]
    measurement = pq.read_pandas(data_path + 'train.parquet',
                                 columns).to_pandas()
    labels = meta_train['target'].iloc[ID * 3:ID * 3 + 3]  # positions of the 3 phase rows

    return ID, measurement, labels
Example #10
def dump_dataset():
    filelist = sorted(glob.glob(f"{config['tracking']}/clouds_*.pq"))

    for t, f in enumerate(filelist):
        print(f'\t {t}/{len(filelist)} ({t/len(filelist)*100:.1f} %)',
              end='\r')

        # Read to Pandas Dataframe and process
        df = pq.read_pandas(f, nthreads=16).to_pandas()

        # Translate indices to coordinates
        df['z'] = df.coord // (c.nx * c.ny)
        xy = df.coord % (c.nx * c.ny)
        df['y'] = xy // c.ny
        df['x'] = xy % c.nx

        # Take cloud regions and trim noise
        df = df[df.type == 0]

        def calc_fractality(df):
            a_, p_ = calculate_parameters(df)
            return pd.DataFrame({'a': [a_], 'p': [p_]})

        group = df.groupby(['cid'], as_index=False)
        with Parallel(n_jobs=16) as Pr:
            result = Pr(
                delayed(calc_fractality)(grouped) for _, grouped in group)
            df = pd.concat(result, ignore_index=True)
            df.to_parquet(f'../pq/fdim_hres_ap_dump_{t:03d}.pq')
Example #11
def compute_constant_auc():
    data_root = '/raid/data/kaggle/ieee-fraud-detection'
    train_trans = pq.read_pandas(data_root +
                                 '/train_transaction.parquet').to_pandas()
    train_trans['preds'] = 1.0
    auc = roc_auc_score(train_trans.isFraud, train_trans.preds)
    print('Auc:', auc)
Example #12
def prep_data(df, start, end, n_dim, min_num, max_num, sample_size):
    # this function takes a piece of data and converts it with transform_ts(), applied to each of the 3 phases
    # if we tried to do it all at once, it could exceed the available RAM
    # load a piece of data from file
    praq_train = pq.read_pandas(TRAIN_DATA,
                                columns=[str(i) for i in range(start, end)
                                         ]).to_pandas()
    X = []
    y = []
    # using tqdm to track processing time
    # take each index from df and iterate from start to end
    # it is divided by 3 because for each id_measurement there are 3 id_signal, and the start/end parameters are id_signal
    for id_measurement in tqdm(
            df.index.levels[0].unique()[int(start / 3):int(end / 3)]):
        X_signal = []
        # for each phase of the signal
        for phase in [0, 1, 2]:
            signal_id, target = df.loc[id_measurement].loc[phase]
            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)
            # extract and transform data into sets of features
            X_signal.append(
                transform_ts(praq_train[str(signal_id)],
                             sample_size=sample_size,
                             n_dim=n_dim,
                             min_num=min_num,
                             max_num=max_num))
        X_signal = np.concatenate(X_signal, axis=1)
        X.append(X_signal)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
    tempdir, use_legacy_dataset
):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = arrow_table.schema.pandas_metadata
    assert not js['index_columns']
    # ARROW-2170
    # While index_columns should be empty, columns needs to be filled still.
    assert js['columns']

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)

    js = table_read.schema.pandas_metadata
    assert not js['index_columns']

    read_metadata = table_read.schema.metadata
    assert arrow_table.schema.metadata == read_metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
Example #14
def plot_sawtooth_example(measid, px, w):
    
    display(HTML(meta_train_df[meta_train_df['id_measurement'] == measid].to_html()))
    
    sigids = [measid * 3 + i for i in range(3)]
    
    fig, ax = plt.subplots(1, 1, figsize=(12, 4))

    for i, sigid in enumerate(sigids):
        d = pq.read_pandas(
            data_dir + '/train.parquet',
            columns=[str(sigid)]
        ).to_pandas().values[:, 0].astype(np.float)

        d = flatiron(d)
        d = d[px-w:px+w+1]
        plt.plot(d, marker='o', label='{}) {}'.format(i, sigid))

        if i == 1:
            s = d[w]
            ft = create_sawtooth_template(3, w, w) * s
            plt.plot(ft, color='black', ls='--')

    plt.axvline(w, color='black', ls='--')
    plt.legend()
    plt.show()
Example #15
def prep_data(start, end):
    # load a piece of data from file
    praq_train = pq.read_pandas('../input/train.parquet',
                                columns=[str(i) for i in range(start, end)
                                         ]).to_pandas()
    X = []
    y = []
    # using tqdm to track processing time
    # take each index from df_train and iterate from start to end
    # it is divided by 3 because for each id_measurement there are 3 id_signal, and the start/end parameters are id_signal
    for id_measurement in tqdm(
            df_train.index.levels[0].unique()[int(start / 3):int(end / 3)]):
        X_signal = []
        # for each phase of the signal
        for phase in [0, 1, 2]:
            # extract from df_train both signal_id and target to compose the new data sets
            signal_id, target = df_train.loc[id_measurement].loc[phase]
            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)
            # extract and transform data into sets of features
            X_signal.append(transform_ts(praq_train[str(signal_id)]))
        # concatenate all the 3 phases in one matrix
        X_signal = np.concatenate(X_signal, axis=1)
        # add the data to X
        X.append(X_signal)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
Example #16
def read_parquet(path, *args, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read Parquet dataset, this will "
                  "be GPU accelerated in the future")
    pa_table = pq.read_pandas(path, *args, **kwargs)
    return DataFrame.from_arrow(pa_table)
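A usage sketch for the wrapper above; the file name and column list are placeholders, and it assumes `read_parquet` and `DataFrame` are the names defined in the surrounding (cuDF-style) module:

# Column selection is forwarded straight to pq.read_pandas via *args/**kwargs.
gdf = read_parquet("transactions.parquet", columns=["user_id", "amount"])
print(type(gdf))  # DataFrame built from the Arrow table via from_arrow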
Example #17
def test_pandas_parquet_datetime_tz():
    import pyarrow.parquet as pq

    s = pd.Series([datetime.datetime(2017, 9, 6)])
    s = s.dt.tz_localize('utc')

    s.index = s

    # Both a column and an index to hit both use cases
    df = pd.DataFrame(
        {
            'tz_aware': s,
            'tz_eastern': s.dt.tz_convert('US/Eastern')
        }, index=s)

    f = BytesIO()

    arrow_table = pa.Table.from_pandas(df)

    _write_table(arrow_table, f, coerce_timestamps='ms')
    f.seek(0)

    table_read = pq.read_pandas(f)

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def read_parquet():
    start_time = time.time()
    train_df = pq.read_pandas(path / 'train.parquet').to_pandas()
    end_time = time.time()
    print(f'loading took {end_time-start_time} secs')
    train_df = train_df.T
    print(f'train_df.shape: {train_df.shape}')

    start_time = time.time()
    test_df = pq.read_pandas(path / 'test.parquet').to_pandas()
    end_time = time.time()
    print(f'loading took {end_time-start_time} secs')
    test_df = test_df.T
    print(f'test_df.shape: {test_df.shape}')

    return train_df, test_df
def prep_data(start, end):
    praq_train = pq.read_pandas(filepath + '/train.parquet',
                                columns=[str(i) for i in range(start, end)
                                         ]).to_pandas()
    X = []
    y = []
    # using tqdm to track processing time
    for id_measurement in tqdm(
            y_train.index.levels[0].unique()[int(start / 3):int(end / 3)]):

        # for each phase of the signal
        for phase in [0, 1, 2]:
            X_signal = []
            signal_id, target = y_train.loc[id_measurement].loc[phase]
            y.append(target)
            X_signal.append(transform_ts(praq_train[str(signal_id)]))
            X_signal = np.concatenate(X_signal, axis=1)
            X.append(X_signal)
        # concatenate all the 3 phases in one matrix
        # X_signal = np.concatenate(X_signal, axis=1)
        # add the data to X
        # X.append(X_signal)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
Example #20
def _load_base_features_src(exp_ids, test, series_df, meta_df, logger):
    target_ids = [
        'e001',
    ]
    if len(set(target_ids) & set(exp_ids)) < 1:
        sel_log(
            f'''
                ======== {__name__} ========
                Stop feature making because even 1 element in exp_ids
                    {exp_ids}
                does not in target_ids
                    {target_ids}''', logger)
        return None, None

    if test:
        series_path = './inputs/origin/test.parquet'
        meta_path = './inputs/origin/metadata_test.csv'
    else:
        series_path = './inputs/origin/train.parquet'
        meta_path = './inputs/origin/metadata_train.csv'

    # Load dfs if not input.
    if not series_df:
        sel_log(f'loading {series_path} ...', None)
        series_df = pq.read_pandas(series_path).to_pandas()
    if not meta_df:
        sel_log(f'loading {meta_path} ...', None)
        meta_df = pd.read_csv(meta_path)

    return series_df, meta_df
Example #21
def test_fastparquet_cross_compatibility(tempdir):
    fp = pytest.importorskip('fastparquet')

    df = pd.DataFrame({
        "a": list("abc"),
        "b": list(range(1, 4)),
        "c": np.arange(4.0, 7.0, dtype="float64"),
        "d": [True, False, True],
        "e": pd.date_range("20130101", periods=3),
        "f": pd.Categorical(["a", "b", "a"]),
        # fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
        # "g": [[1, 2], None, [1, 2, 3]],
    })
    table = pa.table(df)

    # Arrow -> fastparquet
    file_arrow = str(tempdir / "cross_compat_arrow.parquet")
    pq.write_table(table, file_arrow, compression=None)

    fp_file = fp.ParquetFile(file_arrow)
    df_fp = fp_file.to_pandas()
    tm.assert_frame_equal(df, df_fp)

    # Fastparquet -> arrow
    file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet")
    fp.write(file_fastparquet, df)

    table_fp = pq.read_pandas(file_fastparquet)
    # for a fastparquet-written file, categoricals come back as strings
    # (no arrow schema in parquet metadata)
    df['f'] = df['f'].astype(object)
    tm.assert_frame_equal(table_fp.to_pandas(), df)
Example #22
def append_raw_to_parquet(df, full_path, limit_to_today=True):
    """Takes raw df and appends it to an existing parquet file. If the file does not exist, it is created."""
    df = polish_df(df, limit_to_today)
    try:
        df = pd.concat([pq.read_pandas(full_path).to_pandas(), df])
    except OSError:
        pass
    df.to_parquet(full_path)
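A sketch of driving the append helper above from a periodic job; `fetch_raw_frames` is a hypothetical producer, and `polish_df` is assumed to be defined alongside the helper as in its source:

# Hypothetical incremental-update loop.
for raw_df in fetch_raw_frames():  # placeholder generator yielding raw dataframes
    append_raw_to_parquet(raw_df, "prices.parquet", limit_to_today=True)
# On the first run the OSError branch is hit (no existing file) and the file is created.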
Example #23
def _download_data(key_prefix, s3, bucket_name, prefix, sep):
    df_list = []
    if prefix is False:
        file = s3.Object(bucket_name, key_prefix)
        try:
            data = StringIO(str(file.get()['Body'].read(), 'utf-8'))
        except UnicodeDecodeError:
            data = BytesIO(file.get()['Body'].read())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])
        try:
            df_list.append(
                pd.read_csv(data,
                            error_bad_lines=False,
                            warn_bad_lines=False,
                            sep=sep))
        except UnicodeDecodeError:
            df_list.append(pq.read_pandas(data).to_pandas())

    else:
        bucket = s3.Bucket(bucket_name)

        try:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    data = StringIO(str(file.get()['Body'].read(), 'utf-8'))
                    df_list.append(
                        pd.read_csv(data,
                                    error_bad_lines=False,
                                    warn_bad_lines=False,
                                    sep=sep))
        except UnicodeDecodeError:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    data = BytesIO(file.get()['Body'].read())
                    df_list.append(pq.read_pandas(data).to_pandas())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])

    if not df_list:
        print('No matching file found')
        return pd.DataFrame([])
    data = pd.concat(df_list)
    return data
Example #24
def load_retrieved(folder):
    """Load data, project report, and workflow report."""
    names_to_ext = dict(x.split('.') for x in os.listdir(folder))
    names = ('data', 'proj_rep', 'workflow_rep')
    return tuple([
        pq.read_pandas(os.path.join(folder,
                                    f"{n}.{names_to_ext[n]}")).to_pandas()
        for n in names
    ])
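The helper above expects the folder to contain exactly three files named `data.*`, `proj_rep.*`, and `workflow_rep.*` (any extension); a call sketch with a placeholder path:

data, proj_rep, workflow_rep = load_retrieved("retrieved/run_01")  # placeholder folder
print(data.shape, proj_rep.shape, workflow_rep.shape)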
Example #25
def to_array(file):
    tmp_config = {}
    tmp_config["file"] = os.path.abspath(TEMP_INPUT_FOLDER + "/" + file)
    tmp_config["output"] = os.path.abspath(OUTPUT_FOLDER + "/" + file +
                                           "_output.txt")
    data = pq.read_pandas(os.path.abspath(INPUT_FOLDER + "/" +
                                          file)).to_pandas()
    pd.np.savetxt(tmp_config["file"], data.T.values, delimiter=",", fmt="%s")
    tmp_config["length"] = data.values.shape[0]
    return tmp_config
Example #26
def test_read_pandas_column_subset(tmpdir):
    import pyarrow.parquet as pq

    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(reader, columns=['strings', 'uint8']).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
Example #27
    def __init__(self):
        try:
            os.chdir(args.directory_load)
            if ".csv" in args.name:
                self.data = pd.read_csv(args.name)
            elif ".parquet" in args.name:
                self.data = pq.read_pandas(args.name).to_pandas()
            self.nalists = ["?", "na", "na", "null", "Null", "NULL", " "]
            self.output = dict()
        except Exception:
            print("The file does not exist. Please check the path.")
def test_read_pandas_column_subset(tmpdir):
    import pyarrow.parquet as pq

    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(reader, columns=['strings', 'uint8']).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
Example #29
def readRawSignal_extractTestTarget(path, subset_size=50, start_id=8712, end_id=29049):
    relist = []

    processFun = lambda x: extractTestTarget(
                pq.read_pandas(
                    path,
                    columns=[str(val) for val in range(start_id+x*subset_size, min(start_id+(x+1)*subset_size, end_id))]).to_pandas()
                )
    multiProcess = mutiProcessLoop(processFun, range(math.ceil((end_id-start_id)/subset_size)), n_process=4, silence=False)
    resultlist = multiProcess.run()
    return pd.concat(resultlist)
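`mutiProcessLoop` and `extractTestTarget` are project-specific helpers not shown here; a rough standard-library equivalent of the chunked, parallel read using `multiprocessing.Pool` (the chunking arithmetic mirrors the function above, the rest is an assumption):

from multiprocessing import Pool

import pandas as pd
import pyarrow.parquet as pq

def _read_chunk(args):
    path, start, stop = args
    cols = [str(v) for v in range(start, stop)]
    # extractTestTarget comes from the surrounding project and must be importable here.
    return extractTestTarget(pq.read_pandas(path, columns=cols).to_pandas())

def read_raw_signal_pool(path, subset_size=50, start_id=8712, end_id=29049, n_process=4):
    chunks = [(path, start, min(start + subset_size, end_id))
              for start in range(start_id, end_id, subset_size)]
    with Pool(n_process) as pool:
        parts = pool.map(_read_chunk, chunks)
    return pd.concat(parts)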
Example #30
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas('data.parquet',
                            filesystem=SubTreeFileSystem(
                                str(tempdir), LocalFileSystem()))
    assert result.equals(pa.table(df))
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
def test_pandas_parquet_2_0_rountrip(tmpdir):
    import pyarrow.parquet as pq
    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_pandas(filename.strpath)
    assert b'pandas' in table_read.schema.metadata

    assert arrow_table.schema.metadata == table_read.schema.metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_column_multiindex(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10)
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(df.columns, df.columns[::-1])),
        names=['level_1', 'level_2']
    )

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='ms')

    table_read = pq.read_pandas(filename.strpath)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8'))
    assert not js['index_columns']

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='ms')
    table_read = pq.read_pandas(filename.strpath)

    js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8'))
    assert not js['index_columns']

    assert arrow_table.schema.metadata == table_read.schema.metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_datetime_tz():
    import pyarrow.parquet as pq

    s = pd.Series([datetime.datetime(2017, 9, 6)])
    s = s.dt.tz_localize('utc')

    s.index = s

    # Both a column and an index to hit both use cases
    df = pd.DataFrame({'tz_aware': s,
                       'tz_eastern': s.dt.tz_convert('US/Eastern')},
                      index=s)

    f = BytesIO()

    arrow_table = pa.Table.from_pandas(df)

    _write_table(arrow_table, f, coerce_timestamps='ms')
    f.seek(0)

    table_read = pq.read_pandas(f)

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    with pytest.raises(NotImplementedError):
        pq.read_pandas(dirpath)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6], result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)