def apply(path, parameters=None): """ Import a Parquet file Parameters ------------- path Path of the file to import parameters Parameters of the algorithm, possible values: Parameters.COLUMNS -> columns to import from the Parquet file Returns ------------- df Pandas dataframe """ if parameters is None: parameters = {} columns = exec_utils.get_param_value(Parameters.COLUMNS, parameters, None) if columns: df = pq.read_pandas(path, columns=columns).to_pandas() else: df = pq.read_pandas(path).to_pandas() return dataframe_utils.legacy_parquet_support(df)
def apply(path, parameters=None): """ Import a Parquet file Parameters ------------- path Path of the file to import parameters Parameters of the algorithm, possible values: columns -> columns to import from the Parquet file Returns ------------- df Pandas dataframe """ if parameters is None: parameters = {} columns = parameters[COLUMNS] if COLUMNS in parameters else None if columns: columns = [x.replace(":", "AAA") for x in columns] df = pq.read_pandas(path, columns=columns).to_pandas() else: df = pq.read_pandas(path, columns=columns).to_pandas() df.columns = [x.replace("AAA", ":") for x in df.columns] return df
def __getitem__(self, index):
    if isinstance(index, int):
        col_to_load = str(self.metadata['signal_id'].loc[index])
        signal = pq.read_pandas(self.filename, columns=[col_to_load]).to_pandas()
        signal = np.array(signal).reshape(-1).astype(np.float32)
    else:
        if isinstance(index, slice):
            start = index.start if index.start is not None else 0
            stop = index.stop if index.stop is not None else len(self)
            step = index.step if index.step is not None else 1
            if (start < 0) or (stop < 0) or (step < 0):
                raise ValueError('start, stop and step must not be negative')
            indices = list(range(start, stop, step))
        elif isinstance(index, list):
            indices = index
        elif isinstance(index, np.ndarray):
            indices = index.tolist()
        else:
            raise ValueError('index must be an int, slice, list or np.ndarray')
        col_to_load = self.metadata['signal_id'].loc[indices]
        col_to_load = list(map(str, list(col_to_load)))
        signal = pq.read_pandas(self.filename, columns=col_to_load).to_pandas().astype(np.float32)
        signal = signal.values.T
    return signal
def _download_data(key_prefix, s3, bucket_name, prefix, sep, skip_empty_files=True, first_row_columns=True):
    if first_row_columns:
        header_setting = 'infer'
    else:
        header_setting = None

    df_list = []
    if prefix is False:
        file = s3.Object(bucket_name, key_prefix)
        try:
            data = StringIO(str(file.get()['Body'].read(), 'utf-8'))
        except UnicodeDecodeError:
            data = BytesIO(file.get()['Body'].read())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])
        try:
            df_list.append(pd.read_csv(data, error_bad_lines=False, warn_bad_lines=False,
                                       sep=sep, header=header_setting))
        except UnicodeDecodeError:
            df_list.append(pq.read_pandas(data).to_pandas())
        except EmptyDataError:
            print('File is empty')
            return pd.DataFrame([])
    else:
        bucket = s3.Bucket(bucket_name)
        try:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    tmp = StringIO(str(file.get()['Body'].read(), 'utf-8'))
                    try:
                        data = pd.read_csv(tmp, error_bad_lines=False, warn_bad_lines=False,
                                           sep=sep, header=header_setting)
                        df_list.append(data)
                    except EmptyDataError:
                        if skip_empty_files is False:
                            print('Encountered an empty file: ', file.key)
                            return pd.DataFrame([])
        except UnicodeDecodeError:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    data = BytesIO(file.get()['Body'].read())
                    df_list.append(pq.read_pandas(data).to_pandas())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])

    if not df_list:
        print('No matching file found')
        return pd.DataFrame([])

    data = pd.concat(df_list)
    return data
def import_parquet_file(path, parameters=None):
    """
    Import a Parquet file

    Parameters
    -------------
    path
        Path of the file to import
    parameters
        Parameters of the algorithm, possible values:
            columns -> columns to import from the Parquet file

    Returns
    -------------
    df
        Pandas dataframe
    """
    if parameters is None:
        parameters = {}

    columns = parameters[COLUMNS] if COLUMNS in parameters else None
    case_id_glue = parameters[
        pm4py_constants.PARAMETER_CONSTANT_CASEID_KEY] if pm4py_constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else "case:concept:name"
    timestamp_key = parameters[
        pm4py_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if pm4py_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else "time:timestamp"

    if path.startswith("s3dir:///"):
        path = "/" + path.split("s3dir:///")[1]
        all_files = get_list_parquets_from_s3(path)
        dataframes = []
        for f in all_files:
            dataframes.append(import_parquet_file(f))
        df = pd.concat(dataframes)
        df[timestamp_key] = pd.to_datetime(df[timestamp_key], utc=True)
        df = df.sort_values([case_id_glue, timestamp_key])
        return df
    elif path.startswith("s3:///"):
        path = get_parquet_from_s3(path)

    if columns:
        if enable_col_replacement:
            columns = [x.replace(":", "AAA") for x in columns]
        df = pq.read_pandas(path, columns=columns).to_pandas()
    else:
        df = pq.read_pandas(path, columns=columns).to_pandas()

    if enable_col_replacement:
        df.columns = [x.replace("AAA", ":") for x in df.columns]

    return df
def __init__(self, root, transform=None, target_transform=None, _type='test'):
    if _type == 'train':
        # 25148 segmentation fault (core dumped)
        # conda install pyarrow rather than pip.
        train = pd.read_csv(os.path.join(root, 'train.csv'))
        data0 = pq.read_pandas(
            os.path.join(root, 'train_image_data_0.parquet')).to_pandas()
        data1 = pq.read_pandas(
            os.path.join(root, 'train_image_data_1.parquet')).to_pandas()
        data2 = pq.read_pandas(
            os.path.join(root, 'train_image_data_2.parquet')).to_pandas()
        data3 = pq.read_pandas(
            os.path.join(root, 'train_image_data_3.parquet')).to_pandas()
        data_full = pd.concat([data0, data1, data2, data3], ignore_index=True)
    else:
        test = pd.read_csv(os.path.join(root, 'test.csv'))
        data0 = pq.read_pandas(
            os.path.join(root, 'test_image_data_0.parquet')).to_pandas()
        data1 = pq.read_pandas(
            os.path.join(root, 'test_image_data_1.parquet')).to_pandas()
        data2 = pq.read_pandas(
            os.path.join(root, 'test_image_data_2.parquet')).to_pandas()
        data3 = pq.read_pandas(
            os.path.join(root, 'test_image_data_3.parquet')).to_pandas()
        data_full = pd.concat([data0, data1, data2, data3], ignore_index=True)

    ipdb.set_trace()

    self.df = None
    self.label = None
def prep_data(start, end):
    praq_train = pq.read_pandas(data_dir + 'train.parquet',
                                columns=[str(i) for i in range(start, end)]).to_pandas()
    X = []
    y = []
    for id_measurement in tqdm(
            df_train.index.levels[0].unique()[int(start / 3):int(end / 3)]):
        x_signal = []
        # for each phase of the signal
        for phase in [0, 1, 2]:
            # extract from df_train both signal_id and target to compose the new data sets
            signal_id, target = df_train.loc[id_measurement].loc[phase]
            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)
            x_signal.append(transform_ts(praq_train[str(signal_id)]))
        # concatenate all the 3 phases in one matrix
        x_signal = np.concatenate(x_signal, axis=1)
        X.append(x_signal)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
def transform_test():
    # First we declare a series of parameters to initiate the loading of the main data.
    # The file is too large to load in one go, so we split it into 10 parts.
    first_sig = meta_test.index[0]
    n_parts = 10
    max_line = len(meta_test)
    part_size = int(max_line / n_parts)
    last_part = max_line % n_parts
    print(first_sig, n_parts, max_line, part_size, last_part,
          n_parts * part_size + last_part)
    # Here we create a list of lists with start index and end index for each of the
    # 10 parts, and one for the last partial part
    start_end = [[x, x + part_size] for x in range(first_sig, max_line + first_sig, part_size)]
    start_end = start_end[:-1] + [[start_end[-1][0], start_end[-1][0] + last_part]]
    print(start_end)
    X_test = []
    # Now, much like we did above with the train data, we convert the test data part by part,
    # transforming the 3-phase 800000-sample measurements into (160, 57) matrices.
    for start, end in start_end:
        subset_test = pq.read_pandas('../input/test.parquet',
                                     columns=[str(i) for i in range(start, end)]).to_pandas()
        for i in tqdm(subset_test.columns):
            id_measurement, phase = meta_test.loc[int(i)]
            subset_test_col = subset_test[i]
            subset_trans = transform_ts(subset_test_col)
            X_test.append([i, id_measurement, phase, subset_trans])
    X_test_input = np.asarray([np.concatenate([X_test[i][3], X_test[i + 1][3], X_test[i + 2][3]], axis=1)
                               for i in range(0, len(X_test), 3)])
    np.save("../input/X_test.npy", X_test_input)
    return X_test_input
def get_measurement(ID):
    columns = [str(i) for i in range(ID * 3, ID * 3 + 3, 1)]
    measurement = pq.read_pandas(data_path + 'train.parquet', columns).to_pandas()
    # .iloc needs integer positions, not the string column names
    labels = meta_train['target'].iloc[ID * 3:ID * 3 + 3]
    return ID, measurement, labels
def dump_dataset():
    filelist = sorted(glob.glob(f"{config['tracking']}/clouds_*.pq"))

    for t, f in enumerate(filelist):
        print(f'\t {t}/{len(filelist)} ({t/len(filelist)*100:.1f} %)', end='\r')

        # Read to Pandas Dataframe and process
        df = pq.read_pandas(f, nthreads=16).to_pandas()

        # Translate indices to coordinates
        df['z'] = df.coord // (c.nx * c.ny)
        xy = df.coord % (c.nx * c.ny)
        df['y'] = xy // c.ny
        df['x'] = xy % c.nx

        # Take cloud regions and trim noise
        df = df[df.type == 0]

        def calc_fractality(df):
            a_, p_ = calculate_parameters(df)
            return pd.DataFrame({'a': [a_], 'p': [p_]})

        group = df.groupby(['cid'], as_index=False)
        with Parallel(n_jobs=16) as Pr:
            result = Pr(
                delayed(calc_fractality)(grouped) for _, grouped in group)
        df = pd.concat(result, ignore_index=True)

        df.to_parquet(f'../pq/fdim_hres_ap_dump_{t:03d}.pq')
def compute_constant_auc():
    data_root = '/raid/data/kaggle/ieee-fraud-detection'
    train_trans = pq.read_pandas(data_root + '/train_transaction.parquet').to_pandas()
    train_trans['preds'] = 1.0
    auc = roc_auc_score(train_trans.isFraud, train_trans.preds)
    print('Auc:', auc)
def prep_data(df, start, end, n_dim, min_num, max_num, sample_size):
    # This function takes a piece of data and converts it using transform_ts(), once per phase;
    # trying to process everything at once could exceed the available RAM.
    # load a piece of data from file
    praq_train = pq.read_pandas(TRAIN_DATA,
                                columns=[str(i) for i in range(start, end)]).to_pandas()
    X = []
    y = []
    # using tqdm to monitor processing time
    # takes each index from df_train and iterates from start to end
    # divided by 3 because each id_measurement has 3 id_signal, and start/end are id_signal
    for id_measurement in tqdm(
            df.index.levels[0].unique()[int(start / 3):int(end / 3)]):
        X_signal = []
        # for each phase of the signal
        for phase in [0, 1, 2]:
            signal_id, target = df.loc[id_measurement].loc[phase]
            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)
            # extract and transform data into sets of features
            X_signal.append(
                transform_ts(praq_train[str(signal_id)],
                             sample_size=sample_size,
                             n_dim=n_dim,
                             min_num=min_num,
                             max_num=max_num))
        X_signal = np.concatenate(X_signal, axis=1)
        X.append(X_signal)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
    tempdir, use_legacy_dataset
):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = arrow_table.schema.pandas_metadata
    assert not js['index_columns']
    # ARROW-2170
    # While index_columns should be empty, columns needs to be filled still.
    assert js['columns']

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)

    js = table_read.schema.pandas_metadata
    assert not js['index_columns']

    read_metadata = table_read.schema.metadata
    assert arrow_table.schema.metadata == read_metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def plot_sawtooth_example(measid, px, w):
    display(HTML(meta_train_df[meta_train_df['id_measurement'] == measid].to_html()))
    sigids = [measid * 3 + i for i in range(3)]
    fig, ax = plt.subplots(1, 1, figsize=(12, 4))
    for i, sigid in enumerate(sigids):
        d = pq.read_pandas(
            data_dir + '/train.parquet', columns=[str(sigid)]
        ).to_pandas().values[:, 0].astype(np.float)
        d = flatiron(d)
        d = d[px - w:px + w + 1]
        plt.plot(d, marker='o', label='{}) {}'.format(i, sigid))
        if i == 1:
            s = d[w]
            ft = create_sawtooth_template(3, w, w) * s
            plt.plot(ft, color='black', ls='--')
    plt.axvline(w, color='black', ls='--')
    plt.legend()
    plt.show()
def prep_data(start, end):
    # load a piece of data from file
    praq_train = pq.read_pandas('../input/train.parquet',
                                columns=[str(i) for i in range(start, end)]).to_pandas()
    X = []
    y = []
    # using tqdm to monitor processing time
    # takes each index from df_train and iterates from start to end
    # divided by 3 because each id_measurement has 3 id_signal, and start/end are id_signal
    for id_measurement in tqdm(
            df_train.index.levels[0].unique()[int(start / 3):int(end / 3)]):
        X_signal = []
        # for each phase of the signal
        for phase in [0, 1, 2]:
            # extract from df_train both signal_id and target to compose the new data sets
            signal_id, target = df_train.loc[id_measurement].loc[phase]
            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)
            # extract and transform data into sets of features
            X_signal.append(transform_ts(praq_train[str(signal_id)]))
        # concatenate all the 3 phases in one matrix
        X_signal = np.concatenate(X_signal, axis=1)
        # add the data to X
        X.append(X_signal)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
def read_parquet(path, *args, **kwargs):
    """{docstring}"""
    warnings.warn("Using CPU via PyArrow to read Parquet dataset, this will "
                  "be GPU accelerated in the future")
    pa_table = pq.read_pandas(path, *args, **kwargs)
    return DataFrame.from_arrow(pa_table)
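A hedged usage sketch of the wrapper above; "transactions.parquet" and the column names are illustrative, and DataFrame refers to whatever Arrow-aware dataframe class the surrounding module defines.

# Extra keyword arguments are forwarded to pq.read_pandas, so a column subset can be requested directly.
gdf = read_parquet("transactions.parquet", columns=["user_id", "amount"])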
def test_pandas_parquet_datetime_tz():
    import pyarrow.parquet as pq

    s = pd.Series([datetime.datetime(2017, 9, 6)])
    s = s.dt.tz_localize('utc')

    s.index = s

    # Both a column and an index to hit both use cases
    df = pd.DataFrame({'tz_aware': s,
                       'tz_eastern': s.dt.tz_convert('US/Eastern')},
                      index=s)

    f = BytesIO()

    arrow_table = pa.Table.from_pandas(df)

    _write_table(arrow_table, f, coerce_timestamps='ms')
    f.seek(0)

    table_read = pq.read_pandas(f)

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def read_parquet():
    start_time = time.time()
    train_df = pq.read_pandas(path / 'train.parquet').to_pandas()
    end_time = time.time()
    print(f'loading took {end_time-start_time} secs')
    train_df = train_df.T
    print(f'train_df.shape: {train_df.shape}')

    start_time = time.time()
    test_df = pq.read_pandas(path / 'test.parquet').to_pandas()
    end_time = time.time()
    print(f'loading took {end_time-start_time} secs')
    test_df = test_df.T
    print(f'test_df.shape: {test_df.shape}')

    return train_df, test_df
def prep_data(start, end):
    praq_train = pq.read_pandas(filepath + '/train.parquet',
                                columns=[str(i) for i in range(start, end)]).to_pandas()
    X = []
    y = []
    # using tqdm to monitor processing time
    for id_measurement in tqdm(
            y_train.index.levels[0].unique()[int(start / 3):int(end / 3)]):
        # for each phase of the signal
        for phase in [0, 1, 2]:
            X_signal = []
            signal_id, target = y_train.loc[id_measurement].loc[phase]
            y.append(target)
            X_signal.append(transform_ts(praq_train[str(signal_id)]))
            X_signal = np.concatenate(X_signal, axis=1)
            X.append(X_signal)
        # concatenate all the 3 phases in one matrix
        # X_signal = np.concatenate(X_signal, axis=1)
        # add the data to X
        # X.append(X_signal)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
def _load_base_features_src(exp_ids, test, series_df, meta_df, logger):
    target_ids = [
        'e001',
    ]
    if len(set(target_ids) & set(exp_ids)) < 1:
        sel_log(
            f'''
======== {__name__} ========
Stop feature making because not even 1 element in exp_ids {exp_ids}
is in target_ids {target_ids}''', logger)
        return None, None

    if test:
        series_path = './inputs/origin/test.parquet'
        meta_path = './inputs/origin/metadata_test.csv'
    else:
        series_path = './inputs/origin/train.parquet'
        meta_path = './inputs/origin/metadata_train.csv'

    # Load dfs if not input. Check for None explicitly: truth-testing a DataFrame raises.
    if series_df is None:
        sel_log(f'loading {series_path} ...', None)
        series_df = pq.read_pandas(series_path).to_pandas()
    if meta_df is None:
        sel_log(f'loading {meta_path} ...', None)
        meta_df = pd.read_csv(meta_path)

    return series_df, meta_df
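A brief, self-contained illustration of why the explicit None check above matters: truth-testing a pandas DataFrame raises, so an `if not series_df:` guard would crash whenever a DataFrame is actually passed in.

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
try:
    if not df:  # bool(df) raises ValueError: "The truth value of a DataFrame is ambiguous..."
        pass
except ValueError as e:
    print(e)  # hence the `series_df is None` check in the loader above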
def test_fastparquet_cross_compatibility(tempdir):
    fp = pytest.importorskip('fastparquet')

    df = pd.DataFrame({
        "a": list("abc"),
        "b": list(range(1, 4)),
        "c": np.arange(4.0, 7.0, dtype="float64"),
        "d": [True, False, True],
        "e": pd.date_range("20130101", periods=3),
        "f": pd.Categorical(["a", "b", "a"]),
        # fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
        # "g": [[1, 2], None, [1, 2, 3]],
    })
    table = pa.table(df)

    # Arrow -> fastparquet
    file_arrow = str(tempdir / "cross_compat_arrow.parquet")
    pq.write_table(table, file_arrow, compression=None)

    fp_file = fp.ParquetFile(file_arrow)
    df_fp = fp_file.to_pandas()
    tm.assert_frame_equal(df, df_fp)

    # Fastparquet -> arrow
    file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet")
    fp.write(file_fastparquet, df)

    table_fp = pq.read_pandas(file_fastparquet)
    # for fastparquet written file, categoricals comes back as strings
    # (no arrow schema in parquet metadata)
    df['f'] = df['f'].astype(object)
    tm.assert_frame_equal(table_fp.to_pandas(), df)
def append_raw_to_parquet(df, full_path, limit_to_today=True):
    """Takes raw df and appends it to an existing parquet file.
    If the file does not exist, it is created."""
    df = polish_df(df, limit_to_today)
    try:
        df = pd.concat([pq.read_pandas(full_path).to_pandas(), df])
    except OSError:
        pass
    df.to_parquet(full_path)
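A short, hypothetical example of the append helper above; the DataFrame contents and target path are invented for illustration, and polish_df is assumed to be defined elsewhere in the module.

# Append one day's rows to a running Parquet store (the file is created on first use).
new_rows = pd.DataFrame({"date": pd.to_datetime(["2021-01-01"]), "value": [42]})
append_raw_to_parquet(new_rows, "store/daily_values.parquet")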
def _download_data(key_prefix, s3, bucket_name, prefix, sep):
    df_list = []
    if prefix is False:
        file = s3.Object(bucket_name, key_prefix)
        try:
            data = StringIO(str(file.get()['Body'].read(), 'utf-8'))
        except UnicodeDecodeError:
            data = BytesIO(file.get()['Body'].read())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])
        try:
            df_list.append(
                pd.read_csv(data, error_bad_lines=False, warn_bad_lines=False, sep=sep))
        except UnicodeDecodeError:
            df_list.append(pq.read_pandas(data).to_pandas())
    else:
        bucket = s3.Bucket(bucket_name)
        try:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    data = StringIO(str(file.get()['Body'].read(), 'utf-8'))
                    df_list.append(
                        pd.read_csv(data, error_bad_lines=False, warn_bad_lines=False, sep=sep))
        except UnicodeDecodeError:
            for file in bucket.objects.filter(Prefix=key_prefix):
                if 'SUCCESS' not in file.key:
                    data = BytesIO(file.get()['Body'].read())
                    df_list.append(pq.read_pandas(data).to_pandas())
        except ClientError:
            print('File not found on s3')
            return pd.DataFrame([])

    if not df_list:
        print('No matching file found')
        return pd.DataFrame([])

    data = pd.concat(df_list)
    return data
def load_retrieved(folder):
    """Load data, project report, and workflow report."""
    names_to_ext = dict(x.split('.') for x in os.listdir(folder))
    names = ('data', 'proj_rep', 'workflow_rep')
    return tuple([
        pq.read_pandas(os.path.join(folder, f"{n}.{names_to_ext[n]}")).to_pandas()
        for n in names
    ])
def to_array(file):
    tmp_config = {}
    tmp_config["file"] = os.path.abspath(TEMP_INPUT_FOLDER + "/" + file)
    tmp_config["output"] = os.path.abspath(OUTPUT_FOLDER + "/" + file + "_output.txt")
    data = pq.read_pandas(os.path.abspath(INPUT_FOLDER + "/" + file)).to_pandas()
    pd.np.savetxt(tmp_config["file"], data.T.values, delimiter=",", fmt="%s")
    tmp_config["length"] = data.values.shape[0]
    return tmp_config
def test_read_pandas_column_subset(tmpdir):
    import pyarrow.parquet as pq

    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(reader, columns=['strings', 'uint8']).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
def __init__(self):
    try:
        os.chdir(args.directory_load)
        if ".csv" in args.name:
            self.data = pd.read_csv(args.name)
        elif ".parquet" in args.name:
            self.data = pq.read_pandas(args.name).to_pandas()
        self.nalists = ["?", "na", "NA", "null", "Null", "NULL", " "]
        self.output = dict()
    except:
        print("The file does not exist. Please check the path.")
def readRawSignal_extractTestTarget(path, subset_size=50, start_id=8712, end_id=29049):
    relist = []
    processFun = lambda x: extractTestTarget(
        pq.read_pandas(
            path,
            columns=[str(val) for val in range(start_id + x * subset_size,
                                               min(start_id + (x + 1) * subset_size, end_id))]
        ).to_pandas()
    )
    multiProcess = mutiProcessLoop(processFun,
                                   range(math.ceil((end_id - start_id) / subset_size)),
                                   n_process=4,
                                   silence=False)
    resultlist = multiProcess.run()
    return pd.concat(resultlist)
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas('data.parquet',
                            filesystem=SubTreeFileSystem(str(tempdir),
                                                         LocalFileSystem()))
    assert result.equals(pa.table(df))
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
def test_pandas_parquet_2_0_rountrip(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_pandas(filename.strpath)
    assert b'pandas' in table_read.schema.metadata

    assert arrow_table.schema.metadata == table_read.schema.metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_column_multiindex(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10)
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(df.columns, df.columns[::-1])),
        names=['level_1', 'level_2']
    )

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='ms')

    table_read = pq.read_pandas(filename.strpath)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8'))
    assert not js['index_columns']

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='ms')
    table_read = pq.read_pandas(filename.strpath)

    js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8'))
    assert not js['index_columns']

    assert arrow_table.schema.metadata == table_read.schema.metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    with pytest.raises(NotImplementedError):
        pq.read_pandas(dirpath)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6], result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)