def open(self, temp_path):
    """Open ``temp_path`` through the parent class and wrap it in a writer.

    The handle returned by the parent ``open`` is stored on
    ``self._file_handle``. ``use_compliant_nested_type`` is forwarded to
    ``pq.ParquetWriter`` only when ``ARROW_MAJOR_VERSION`` is 4 or higher.

    Args:
        temp_path: Path handed to the parent class's ``open``.

    Returns:
        A ``pq.ParquetWriter`` writing to the opened handle.
    """
    self._file_handle = super().open(temp_path)
    writer_options = {
        'compression': self._codec,
        'use_deprecated_int96_timestamps':
            self._use_deprecated_int96_timestamps,
    }
    if ARROW_MAJOR_VERSION >= 4:
        # Only read the attribute on the branch that actually uses it.
        writer_options['use_compliant_nested_type'] = (
            self._use_compliant_nested_type)
    return pq.ParquetWriter(self._file_handle, self._schema, **writer_options)
def test_it_handles_files_with_multiple_row_groups_and_pandas_indexes(
    mock_load_parquet,
):
    """Deletion works on a multi-row-group file built from an indexed frame."""
    # Arrange: two customers, only the first is listed in MatchIds.
    rows = [{"customer_id": "12345"}, {"customer_id": "34567"}]
    columns = [{
        "Column": "customer_id",
        "MatchIds": ["12345"],
        "Type": "Simple"
    }]
    frame = pd.DataFrame(rows, list("ab"))
    table = pa.Table.from_pandas(frame)
    source_buf = BytesIO()
    # Create parquet with multiple row groups by writing the table 3 times.
    with pq.ParquetWriter(source_buf, table.schema) as writer:
        for _ in range(3):
            writer.write_table(table)
    reader = pa.BufferReader(source_buf.getvalue())
    mock_load_parquet.return_value = pq.ParquetFile(reader, memory_map=False)

    # Act
    out, stats = delete_matches_from_parquet_file("input_file.parquet", columns)

    # Assert
    assert {"ProcessedRows": 6, "DeletedRows": 3} == stats
    result_reader = pa.BufferReader(out.getvalue())
    result_file = pq.ParquetFile(result_reader, memory_map=False)
    assert 3 == result_file.num_row_groups
    assert 3 == result_file.read().num_rows
def _write_buffer(self):
    """Flush the buffered column data to the Parquet writer as one table.

    Converts ``self.schema`` to a ``ParquetSchema`` when it is not one
    already, creates ``self.pq_writer`` on first use (targeting
    ``self.options.outfile``), writes every buffered column, then empties
    the buffers and resets ``self.buffer_line`` to 0.
    """
    if not isinstance(self.schema, ParquetSchema):
        self.schema = ParquetSchema.convert(self.schema)
    if not self.pq_writer:
        self.pq_writer = pq.ParquetWriter(
            self.options.outfile, self.schema.to_arrow()
        )

    arrow_columns = []
    for name, values in self.column_buffer.items():
        python_type = self.schema.columns[name].type_py
        arrow_type = self.schema.columns[name].type_pa
        coerced = self.coerce_column(name, values, python_type)
        arrow_columns.append(
            pa.column(name, pa.array(coerced, type=arrow_type))
        )

    self.pq_writer.write_table(
        pa.Table.from_arrays(arrow_columns, schema=self.schema.to_arrow())
    )

    # Keep the keys, drop the data: every column starts the next batch empty.
    for name in self.column_buffer:
        self.column_buffer[name] = []
    self.buffer_line = 0
def _chunk_readwrite(archive_url, dest_path, chunksize, header, encoding,
                     dtype, dataset):
    """Stream a CSV archive into Parquet, one pandas chunk at a time.

    Pandas reads the archive in chunks and pyarrow writes each chunk.

    Parameters
    ----------
    archive_url : passed to ``pd.read_csv`` as the source.
    dest_path : either a ``file.parquet`` path or, for a partitioned
        dataset, the destination folder of the partition files.
    chunksize : number of rows per chunk read by ``pd.read_csv``.
    header : column names, forwarded to ``pd.read_csv`` as ``names``.
    encoding, dtype : forwarded to ``pd.read_csv``.
    dataset : when truthy, write with ``pq.write_to_dataset`` instead of a
        single ``pq.ParquetWriter``.

    Returns
    -------
    ``np.copy`` of the first chunk's Arrow schema when ``dataset`` is
    truthy and at least one chunk was read; otherwise an empty list.
    """
    pqwriter = None
    # Kept separate from the ``header`` argument. The previous code reset
    # ``header`` to [] before the read, so the caller's column names were
    # never passed to read_csv.
    schema_header = []
    try:
        chunks = pd.read_csv(archive_url, chunksize=chunksize, names=header,
                             encoding=encoding, dtype=dtype)
        for i, df in enumerate(chunks):
            table = pa.Table.from_pandas(df)
            if dataset:
                if i == 0:
                    schema_header = np.copy(table.schema)
                # NOTE(review): ``partition_cols`` is not defined in this
                # function; presumably a module-level name - confirm.
                pq.write_to_dataset(table, root_path=dest_path,
                                    partition_cols=partition_cols)
            else:
                if pqwriter is None:
                    pqwriter = pq.ParquetWriter(dest_path, table.schema)
                pqwriter.write_table(table)
    finally:
        # Close even when a chunk fails so the file handle is released.
        if pqwriter is not None:
            pqwriter.close()
    return schema_header
def parquet():
    """Page through the sample ``order_info`` query into ``city.1.parquet``.

    Runs the query on an Impala connection, then repeatedly converts the
    frame returned by ``as_pandas`` into an Arrow table and appends it to
    the Parquet file until an empty frame comes back.

    The writer is created from the first page's schema (the previous code
    read ``table.schema`` before ``table`` existed) and is always closed so
    the Parquet footer gets written. If the query returns no rows, no file
    is created.
    """
    query = ''' select * from evcard_tmp.order_info where strleft(created_time,4)='2015' limit 10 '''
    with get_impala_connection() as impala_conn:
        with impala_conn.cursor() as cursor:
            cursor.arraysize = 500
            cursor.execute(query)
            _columns = [metadata[0] for metadata in cursor.description]
            writer = None
            _page = 1
            total_count = 0
            try:
                df = as_pandas(cursor, _columns)
                while len(df) > 0:
                    total_count += len(df)
                    logger.debug('process page %s(%s)...', _page, total_count)
                    table = pa.Table.from_pandas(df, preserve_index=False)
                    if writer is None:
                        # The schema is only known once a page is in hand.
                        writer = pq.ParquetWriter('city.1.parquet',
                                                  table.schema)
                    writer.write_table(table)
                    df = as_pandas(cursor, _columns)
                    _page += 1
            finally:
                if writer is not None:
                    writer.close()
def write(self, gdf, shuffle=True):
    """Slice ``gdf`` into ``self.nfiles`` row ranges and append each to a file.

    Args:
        gdf: Frame to write; converted per slice with ``to_arrow()``.
        shuffle: When true, rows are reordered by a random permutation
            before slicing. Note that the temporary ``__sort_index__``
            column is assigned on the frame that was passed in before the
            sorted copy is taken.
    """
    total_rows = len(gdf)
    if shuffle:
        # Shuffle the dataframe: attach a random permutation and sort by it.
        sort_key = "__sort_index__"
        permutation = cp.arange(total_rows)
        cp.random.shuffle(permutation)
        gdf[sort_key] = cudf.Series(permutation)
        gdf = gdf.sort_values(sort_key).drop(columns=[sort_key])

    rows_per_file = int(total_rows / self.nfiles)
    for index, path in enumerate(FileIterator(self.path, self.nfiles)):
        start = index * rows_per_file
        if index == (self.nfiles - 1):
            # The last file takes whatever rows remain.
            stop = total_rows
        else:
            stop = (index + 1) * rows_per_file
        arrow_table = gdf[start:stop].to_arrow()
        if self.writers[path] is None:
            # Writers are created on first use, once a schema is available.
            self.writers[path] = pq.ParquetWriter(
                path,
                arrow_table.schema,
                metadata_collector=self.new_metadata[path],
            )
        self.writers[path].write_table(arrow_table)
def parquet_converter(file):
    """Convert the ``End_Lon``/``End_Lat`` columns of a CSV to Parquet.

    The CSV is read in chunks of 500000 rows as float32 and each chunk is
    appended to one snappy-compressed Parquet file.

    Args:
        file: CSV source passed to ``pd.read_csv``.
    """
    chunksize = 500000
    # Start unbound-safe: an input that yields no chunks must not hit an
    # unassigned ``pqwriter`` at close time.
    pqwriter = None
    chunks = pd.read_csv(file,
                         chunksize=chunksize,
                         usecols=["End_Lon", "End_Lat"],
                         dtype={
                             "End_Lon": np.float32,
                             "End_Lat": np.float32
                         },
                         delimiter=' *, *',
                         engine="python")
    try:
        for chunk in chunks:
            table = pa.Table.from_pandas(chunk)
            if pqwriter is None:
                # The first chunk supplies the schema for the output file.
                # NOTE(review): ``target`` is not defined in this function;
                # presumably a module-level output path - confirm.
                pqwriter = pq.ParquetWriter(target, table.schema,
                                            compression='snappy')
            pqwriter.write_table(table)
    finally:
        if pqwriter is not None:
            pqwriter.close()
def test_parquet_incremental_file_build(tmpdir):
    """Tables written one by one read back as the concatenated frames."""
    import pyarrow.parquet as pq

    df = _test_dataframe(100)
    df['unique_id'] = 0
    schema_table = pa.Table.from_pandas(df, preserve_index=False)

    out = pa.BufferOutputStream()
    writer = pq.ParquetWriter(out, schema_table.schema, version='2.0')

    frames = []
    for unique_id in range(10):
        df['unique_id'] = unique_id
        writer.write_table(pa.Table.from_pandas(df, preserve_index=False))
        # Copy, because ``df`` is mutated again on the next iteration.
        frames.append(df.copy())
    writer.close()

    result = _read_table(pa.BufferReader(out.get_result()))
    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
def collect_results(filepointer_item, outbase, compression, mutation_mode, kmer_list = None):
    """Merge per-batch temporary Parquet files into their final output files.

    For every target path, all files matching
    ``<outbase>/tmp_out_<mutation_mode>_[0-9]*/<basename>`` are read and
    appended to a single Parquet file at the target path.

    Args:
        filepointer_item: Mapping with a ``'path'`` entry, or ``None`` to do
            nothing. ``'path'`` is a single path, or a mapping of paths when
            ``kmer_list`` is truthy.
        outbase: Directory containing the ``tmp_out_*`` folders.
        compression: Compression passed to ``pq.ParquetWriter``.
        mutation_mode: Used to build the temporary folder pattern.
        kmer_list: When truthy, ``filepointer_item['path']`` is treated as a
            mapping and every value is collected.

    Raises:
        SystemExit: With status 1 when a temporary file cannot be processed.
    """
    if filepointer_item is None:
        return
    s1 = timeit.default_timer()
    if kmer_list:
        file_to_collect = filepointer_item['path'].values()
    else:
        file_to_collect = [filepointer_item['path']]
    for file_path in file_to_collect:
        file_name = os.path.basename(file_path)
        tmp_file_list = glob.glob(os.path.join(outbase, 'tmp_out_{}_[0-9]*'.format(mutation_mode), file_name))
        tot_shape = 0
        pqwriter = None
        try:
            for tmp_file in tmp_file_list:
                try:
                    table = pq.read_table(tmp_file)
                    # Key on the writer, not the row count: a first table
                    # with zero rows must not cause a second writer to be
                    # opened on the same output path.
                    if pqwriter is None:
                        pqwriter = pq.ParquetWriter(file_path, table.schema, compression=compression)
                    pqwriter.write_table(table)
                    tot_shape += table.shape[0]
                except Exception:
                    # Not a bare ``except``: Ctrl-C and SystemExit propagate.
                    logging.info("ERROR: file {} could not be read".format(tmp_file))
                    sys.exit(1)
        finally:
            if pqwriter is not None:
                pqwriter.close()
        if tmp_file_list:
            logging.info('Collecting {} with {} lines. Took {} seconds'.format(file_name, tot_shape, timeit.default_timer()-s1))
def write_df_to_parquet(self, df, preserve_index=False, close_writer=True, schema=None):
    """Write a pandas DataFrame to a new Parquet file at ``self._output_file``.

    A fresh ``pq.ParquetWriter`` is created and stored on ``self._pqwriter``
    on every call.

    Args:
        df (pandas.DataFrame): Data to be written to the Parquet file.
        preserve_index (bool): Forwarded to ``pa.Table.from_pandas``; set to
            True to keep the DataFrame index. Defaults to False.
        close_writer (bool): When True (the default), ``self.close()`` is
            called after the table has been written.
        schema (pyarrow.Schema, optional): Expected schema of the Arrow
            table, for column types that cannot be inferred automatically.
    """
    arrow_table = pa.Table.from_pandas(
        df, preserve_index=preserve_index, schema=schema)
    self._pqwriter = pq.ParquetWriter(self._output_file, arrow_table.schema)
    self._pqwriter.write_table(arrow_table)
    if not close_writer:
        return
    self.close()
def parquet_write(write_path, block):
    """Write ``block`` to ``write_path`` as a Parquet file.

    The block is wrapped with ``BlockAccessor.for_block`` and converted to
    an Arrow table, which is written through a ``pq.ParquetWriter`` context
    manager.
    """
    accessor = BlockAccessor.for_block(block)
    logger.debug(
        f"Writing {accessor.num_rows()} records to {write_path}.")
    arrow_table = accessor.to_arrow_table()
    with pq.ParquetWriter(write_path, arrow_table.schema) as writer:
        writer.write_table(arrow_table)
def from_ticker_to_parquet():
    """Convert ticker dump files in ``TICKERS_DIR`` to ``ticker.parquet``.

    Files whose name starts with ``ticker_`` are visited in reverse-sorted
    order; the first three of them are skipped. Each remaining file is
    parsed as newline-separated JSON records and written out with the
    columns ``timestamp``, ``exchange_name``, ``pair`` and ``rate`` (the
    latter converted to float).
    """
    names = ('timestamp', 'exchange_name', 'pair', 'rate')
    ticker_files_seen = 0
    for file_name in sorted(listdir(TICKERS_DIR), reverse=True):
        if not file_name.startswith('ticker_'):
            continue
        ticker_files_seen += 1
        if ticker_files_seen <= 3:
            # pass last 3 files (still can be open for writing)
            continue
        with open(TICKERS_DIR + '/' + file_name, 'r') as f:
            # Turn "record\nrecord\n" into "[record,record]"; the slice
            # drops the final character (the comma from the last newline).
            joined = f.read().replace('\n', ',')
            tickers = loads('[' + joined[:-1] + ']')
            batch = defaultdict(list)
            for ticker in tickers:
                for name in names:
                    value = ticker[name]
                    if name == 'rate':
                        value = float(value)
                    batch[name].append(value)
            tables = pa.Table.from_arrays(
                [pa.array(batch[name]) for name in names], names)
            # NOTE(review): every processed file is written to the same
            # path, so each one replaces the previous output - confirm
            # whether appending to a single file was intended.
            with pq.ParquetWriter(TICKERS_DIR + '/ticker.parquet',
                                  tables.schema,
                                  use_dictionary=False,
                                  flavor={'spark'}) as writer:
                writer.write_table(tables)
def test_write_compliant_nested_type_enable(tempdir, use_legacy_dataset,
                                            test_data):
    """With ``use_compliant_nested_type=True`` list items are named 'element'."""
    frame = pd.DataFrame(data=test_data)

    # The pandas round trip must work with the flag switched on.
    _roundtrip_pandas_dataframe(
        frame,
        write_kwargs={'use_compliant_nested_type': True},
        use_legacy_dataset=use_legacy_dataset)

    # Write a Parquet file with the compliant nested type enabled.
    source_table = pa.Table.from_pandas(frame, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, source_table.schema,
                          use_compliant_nested_type=True,
                          version='2.0') as writer:
        writer.write_table(source_table)

    # Read it back and check that the "items" column follows the Parquet
    # nested format, i.e. looks like:
    #   list<element: struct<name: string, value: string>>
    new_table = _read_table(path)
    first_type = new_table.schema.types[0]
    assert isinstance(first_type, pa.ListType)
    assert first_type.value_field.name == 'element'

    # The table that was read back must itself round-trip correctly.
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=True)
def execute(self, df):
    """Explode the per-sensor axis readings and upload them as Parquet.

    For every distinct value in column ``self.sens_pos``, the
    comma-separated strings in columns ``self.X``, ``self.Y`` and ``self.Z``
    are split and converted to ints, giving one row per reading. The
    combined frame is written as a Parquet object to the bucket through
    ``s3fs``.

    Args:
        df: Input frame; it is copied, the caller's frame is not modified.

    Returns:
        The copy of ``df`` with column ``self.output_status`` set to the
        ``bucket/path`` the data was written to.
    """
    df = df.copy()
    bucket = 'iotcs-as-bucket'
    path = 's3test-alberto-%s.parquet' % (dt.datetime.now().isoformat())
    bucket_uri = '{bucket}/{path}'.format(bucket=bucket, path=path)
    client_kwargs = {"endpoint_url": self.cos_credentials["endpoint_url"]}
    fs = s3fs.S3FileSystem(
        key=self.cos_credentials["cos_hmac_keys"]["access_key_id"],
        secret=self.cos_credentials["cos_hmac_keys"]["secret_access_key"],
        client_kwargs=client_kwargs)

    per_sensor = []
    for sens in df[self.sens_pos].drop_duplicates():
        sensor_rows = df[df[self.sens_pos] == sens]
        df1 = pd.concat([
            pd.Series([
                int(element)
                for list_ in sensor_rows[axis].str.split(',').values
                for element in list_
            ]).rename(axis) for axis in [self.X, self.Y, self.Z]
        ], axis=1)
        df1[self.sens_pos] = sens
        per_sensor.append(df1)
    df2 = pd.concat(per_sensor, ignore_index=True)
    ta = pa.Table.from_pandas(df2)

    # Both the writer and the remote file must be closed explicitly: the
    # Parquet footer is written on writer close, and the sink has to be
    # closed after it so the complete object is flushed.
    with fs.open(bucket_uri, 'wb') as sink:
        with pq.ParquetWriter(sink, schema=ta.schema) as pw:
            pw.write_table(ta)

    df[self.output_status] = bucket_uri
    return df
def to_parquet_align(self, output_dir=None, output_prefix='d6tstack-', write_params=None):
    """
    Same as `to_csv_align` but outputs parquet files

    Every input file in ``self.fname_list`` is streamed chunk by chunk into
    its own ``.pq`` file. All chunks are cast to the dtypes of
    ``self.df_combine_preview`` and written with the schema derived from it.

    Args:
        output_dir: forwarded to ``self._get_filepath_out``.
        output_prefix: forwarded to ``self._get_filepath_out``.
        write_params (dict, optional): extra keyword arguments passed to the
            writer's ``write_table`` for every chunk. ``None`` means none.

    Returns:
        list: paths of the written parquet files.
    """
    # Avoid a shared mutable default; None stands for "no extra arguments".
    write_params = {} if write_params is None else write_params

    # stream all chunks to multiple files
    self._combine_preview_available()

    import pyarrow as pa
    import pyarrow.parquet as pq

    fnamesout = []
    target_dtypes = self.df_combine_preview.dtypes
    pqschema = pa.Table.from_pandas(self.df_combine_preview).schema
    for fname in self.fname_list:
        filename = self._get_filepath_out(fname, output_dir, output_prefix, '.pq')
        if self.logger:
            self.logger.send_log('writing '+filename , 'ok')
        # The context manager closes the writer even if a chunk fails.
        with pq.ParquetWriter(filename, pqschema) as pqwriter:
            for dfc in self._read_csv_yield(fname, self.read_csv_params):
                chunk_table = pa.Table.from_pandas(
                    dfc.astype(target_dtypes), schema=pqschema)
                pqwriter.write_table(chunk_table, **write_params)
        fnamesout.append(filename)
    return fnamesout
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.

    For every table in ``metadata.tables`` the matching ``.csv.zst`` extract
    is streamed through the Arrow CSV reader and each batch is appended to a
    zstd-compressed Parquet file. The schema directories under
    ``EXTRACT_PATH_PREFIX`` and ``PARQUET_PREFIX`` are created if missing.
    """
    for alchemy_table in metadata.tables.values():
        name = alchemy_table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        # Materialise once; the fields feed both the schema and the names.
        fields = list(get_fields(alchemy_table))
        arrow_schema = pa.schema(fields)
        column_names = [field_name for field_name, _dtype in fields]

        read_options = pcsv.ReadOptions(column_names=column_names,
                                        block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(
            column_types=arrow_schema,
            timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
            true_values=["1", "T"],
            false_values=["0", "F"],
            strings_can_be_null=True)

        # Open the input first so a missing extract does not leave behind a
        # freshly created, never-closed output file.
        stream_reader = pcsv.open_csv(extract_file,
                                      read_options=read_options,
                                      parse_options=parse_options,
                                      convert_options=convert_options)
        with pq.ParquetWriter(parquet_file,
                              schema=arrow_schema,
                              compression='zstd',
                              version="2.0",
                              use_dictionary=True) as parquet_writer:
            for batch in stream_reader:
                parquet_writer.write_table(pa.Table.from_batches([batch]))
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
    """Tables differing only in pandas metadata can share one writer."""
    # ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
    schema = pa.schema([
        pa.field('int', pa.int16()),
        pa.field('float', pa.float32()),
        pa.field('string', pa.string())
    ])
    df1 = pd.DataFrame({
        'int': np.arange(3, dtype=np.uint8),
        'float': np.arange(3, dtype=np.float32),
        'string': ['ABBA', 'EDDA', 'ACDC']
    })
    df2 = pd.DataFrame({
        'int': [4, 5],
        'float': [1.1, None],
        'string': [None, None]
    })
    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

    # Same fields, but the attached metadata differs between the tables.
    assert not table1.schema.equals(table2.schema, check_metadata=True)
    assert table1.schema.equals(table2.schema)

    # Use the context manager so the writer is closed and the file handle
    # in the temporary directory is released when the test ends.
    with pq.ParquetWriter(tempdir / 'merged.parquet',
                          schema=schema) as writer:
        writer.write_table(table1)
        writer.write_table(table2)
def csv_to_parquet():
    """Stream ``CSV_FILE_PATH`` into a snappy Parquet file chunk by chunk.

    The CSV is read in chunks of ``CHUNK_SIZE`` rows. The first chunk
    defines the Parquet schema and every later chunk is converted with that
    schema. File sizes are logged before and after the conversion.
    """
    logger.info('Starting...')
    stream = pd.read_csv(
        CSV_FILE_PATH,
        chunksize=CHUNK_SIZE,
        low_memory=False,
        sep=',',
        encoding='latin-1',
    )
    logger.info(
        f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    first_chunk = next(stream)
    first_table = pa.Table.from_pandas(first_chunk)
    parquet_schema = first_table.schema

    with pq.ParquetWriter(PARQUET_FILE_PATH, parquet_schema,
                          compression='snappy') as parquet_writer:
        logger.debug('Processing 1-th chunk...')
        # The first chunk must be written too; previously it was consumed
        # only to derive the schema, so its rows never reached the output.
        parquet_writer.write_table(first_table)
        for i, chunk in enumerate(stream, 2):
            logger.debug(f'Processing {i}-th chunk...')
            table = pa.Table.from_pandas(chunk, parquet_schema)
            parquet_writer.write_table(table)

    logger.info(
        f'Parquet Stored Size: {PARQUET_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB'
    )
    logger.info('Finished!')
def test_parquet_writer_context_obj_with_exception(tempdir,
                                                   use_legacy_dataset):
    """Data written before an error inside the ``with`` block stays readable."""
    df = _test_dataframe(100)
    df['unique_id'] = 0
    schema_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()
    error_text = 'Artificial Error'

    try:
        with pq.ParquetWriter(out, schema_table.schema,
                              version='2.0') as writer:
            frames = []
            for unique_id in range(10):
                df['unique_id'] = unique_id
                writer.write_table(
                    pa.Table.from_pandas(df, preserve_index=False))
                frames.append(df.copy())
                # Abort part-way through, after six tables were written.
                if unique_id == 5:
                    raise ValueError(error_text)
    except Exception as exc:
        assert str(exc) == error_text

    result = _read_table(pa.BufferReader(out.getvalue()),
                         use_legacy_dataset=use_legacy_dataset)
    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
def open(self, temp_path):
    """Open ``temp_path`` through the parent sink and return a Parquet writer.

    The handle returned by the parent ``open`` is kept on
    ``self._file_handle``; the writer uses ``self._schema``, ``self._codec``
    and ``self._use_deprecated_int96_timestamps``.
    """
    self._file_handle = super(_ParquetSink, self).open(temp_path)
    writer_options = {
        'compression': self._codec,
        'use_deprecated_int96_timestamps':
            self._use_deprecated_int96_timestamps,
    }
    return pq.ParquetWriter(self._file_handle, self._schema, **writer_options)
def stream_json(fn, parquet_fn, schema=None, chunk_size=10000000):
    """Stream one or more line-oriented JSON files into one Parquet file.

    Args:
        fn: A file name or a list of file names. Names ending in ``.gz`` are
            opened with ``gzip``.
        parquet_fn: Destination passed to ``pq.ParquetWriter``.
        schema: Arrow schema of the output. When ``None`` it is taken from
            ``read_json(fn[0])``.
        chunk_size: Size hint passed to ``readlines`` for each chunk.

    Raises:
        AssertionError: If a chunk parses to a schema different from
            ``schema``.
    """
    if isinstance(fn, str):
        fn = [fn]
    if schema is None:
        schema = read_json(fn[0]).schema
    with pq.ParquetWriter(parquet_fn, schema) as writer:
        for _f in fn:
            # Open both kinds in text mode. ``gzip.open(..., 'r')`` is
            # binary and yields bytes, which broke the ``''.join`` below.
            opener = gzip.open if _f.endswith('.gz') else open
            with opener(_f, 'rt') as f:
                while True:
                    chunk = f.readlines(chunk_size)
                    if not chunk:
                        break
                    tbl = read_json(io.BytesIO(''.join(chunk).encode()))
                    # make sure the read table schema is the same as the
                    # parsed schema
                    assert tbl.schema == schema
                    writer.write_table(tbl)
def compress_descriptions(encoding='utf-8', batch_size=1000, compression='BROTLI'):
    """Convert tarfile to parquet

    Every ``.html`` file member of ``YAHOO_ARCH`` becomes one row with the
    member's file stem as ``symbol`` and its decoded content as ``html``.
    Rows are written to ``YAHOO_PARQUET`` in batches.

    Args:
        encoding: Encoding used to decode each member's bytes.
        batch_size: Number of rows collected before a batch is written.
        compression: Compression passed to ``pq.ParquetWriter``.
    """
    names = ('symbol', 'html')

    def read_incremental():
        """Incremental generator of batches"""
        with tarfile.open(YAHOO_ARCH) as archive:
            batch = defaultdict(list)
            for member in tqdm(archive):
                if member.isfile() and member.name.endswith('.html'):
                    batch['symbol'].append(Path(member.name).stem)
                    batch['html'].append(
                        archive.extractfile(member).read().decode(encoding))
                    # Checked only after an append, so a fresh batch is never
                    # touched (a lookup would create an empty key and make
                    # the final ``if batch`` test true).
                    if len(batch['symbol']) >= batch_size:
                        yield pa.Table.from_arrays(
                            [pa.array(batch[n]) for n in names], names)
                        batch = defaultdict(list)
            if batch:
                # last partial batch
                yield pa.Table.from_arrays(
                    [pa.array(batch[n]) for n in names], names)

    writer = None
    try:
        for batch in read_incremental():
            if writer is None:
                writer = pq.ParquetWriter(YAHOO_PARQUET,
                                          batch.schema,
                                          use_dictionary=False,
                                          compression=compression,
                                          flavor={'spark'})
            writer.write_table(batch)
    finally:
        # An archive without .html members produces no writer at all.
        if writer is not None:
            writer.close()
def convert_csv_file_to_typed_parquet_file(csv_file, parquet_file,
                                           compression, schema, ntimes=1):
    """Write the CSV's table, cast to ``schema``, ``ntimes`` times to Parquet.

    Args:
        csv_file: Input passed to ``read_parquet_from_csv_file``.
        parquet_file: Destination passed to ``pq.ParquetWriter``.
        compression: Compression passed to ``pq.ParquetWriter``.
        schema: Arrow schema the table is cast to and written with.
        ntimes: How many times the same table is appended to the file.
    """
    typed_table = read_parquet_from_csv_file(csv_file).cast(schema)
    writer = pq.ParquetWriter(
        parquet_file, schema=schema, compression=compression)
    for _ in range(ntimes):
        writer.write_table(typed_table)
    writer.close()
def concatenate_files(files, output_path, variant_metadata_path, frequency):
    """Join the columns of several Parquet files into one output file.

    All columns of the first file are kept; for every later file the first
    column is skipped. When both a variant metadata file and ``frequency``
    are given, only the columns returned by ``get_cleared_columns`` are
    read from each file.
    """
    if variant_metadata_path:
        vmf = pq.ParquetFile(variant_metadata_path)
    else:
        vmf = None
    filter_columns = vmf is not None and frequency is not None

    columns = []
    for index, file_path in enumerate(files):
        logging.info(file_path)
        parquet_file = pq.ParquetFile(file_path)
        if filter_columns:
            cleared = get_cleared_columns(vmf, frequency, index)
            logging.info("%d cleared snps", len(cleared))
            part = parquet_file.read(columns=cleared)
        else:
            part = parquet_file.read()
        # Column 0 is taken from the first file only.
        first_column = 1 if index else 0
        columns.extend(
            part.column(position)
            for position in range(first_column, part.num_columns))

    logging.info("Creating table")
    table = pa.Table.from_arrays(columns)
    logging.info("saving...")
    writer = pq.ParquetWriter(output_path, table.schema, flavor="spark")
    writer.write_table(table)
    writer.close()
    logging.info("finished.")
def test_write_compliant_nested_type_disable(tempdir, use_legacy_dataset,
                                             test_data):
    """Without the compliant-nested-type flag list items are named 'item'."""
    frame = pd.DataFrame(data=test_data)

    # Round trip with the flag left out (default behaviour).
    _roundtrip_pandas_dataframe(frame,
                                write_kwargs={},
                                use_legacy_dataset=use_legacy_dataset)

    # Write a Parquet file without enabling the compliant nested type.
    source_table = pa.Table.from_pandas(frame, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, source_table.schema,
                          version='2.6') as writer:
        writer.write_table(source_table)

    # The "items" column must NOT follow the Parquet nested format, i.e. it
    # should look like:
    #   list<item: struct<name: string, value: string>>
    new_table = _read_table(path)
    first_type = new_table.schema.types[0]
    assert isinstance(first_type, pa.ListType)
    assert first_type.value_field.name == 'item'

    # The table that was read back must itself round-trip correctly.
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=False)
def generate_parquet_file(
    cls, name: str, columns: Mapping[str, str], num_rows: int, custom_rows: Mapping[int, List[str]] = None
) -> str:
    """Generates a random data and save it to a tmp file

    Rows are produced with ``cls._generate_row`` unless ``custom_rows`` has
    a truthy entry for the row index. They are flushed to the Parquet file
    in batches of up to 100 rows. When no rows are requested an empty table
    is written instead.

    Args:
        name: Base name of the file; ``cls.filetype`` is appended as the
            extension and the file is placed in ``TMP_FOLDER``.
        columns: Mapping of column name to type; the values are passed to
            ``cls._generate_row``.
        num_rows: Number of rows to generate.
        custom_rows: Optional mapping of row index to a fixed row.

    Returns:
        Path of the written file.
    """
    filename = os.path.join(TMP_FOLDER, name + "." + cls.filetype)
    pq_writer = None
    types = list(columns.values()) if num_rows else []
    custom_rows = custom_rows or {}
    column_names = list(columns.keys())
    buffer = []
    try:
        for i in range(num_rows):
            buffer.append(custom_rows.get(i) or cls._generate_row(types))
            # Keep buffering until 100 rows are collected or the last row
            # has been reached.
            if i != (num_rows - 1) and len(buffer) < 100:
                continue
            # Transpose the buffered rows into {column name: [values...]}.
            data = {col_values[0]: list(col_values[1:]) for col_values in zip(column_names, *buffer)}
            buffer = []
            df = pd.DataFrame(data)
            table = pa.Table.from_pandas(df)
            if pq_writer is None:
                pq_writer = pq.ParquetWriter(filename, table.schema)
            pq_writer.write_table(table, row_group_size=100)
    finally:
        # Close explicitly so the file is complete before the path is
        # handed back, instead of relying on garbage collection.
        if pq_writer is not None:
            pq_writer.close()
    if pq_writer is None:
        pq.write_table(pa.Table.from_arrays([]), filename)
    return filename
def append_to_parquet(df: pd.DataFrame, writer: pq.ParquetWriter, filepath: str) -> pq.ParquetWriter:
    """Append ``df`` to a Parquet file, creating the writer on first use.

    Args:
        df: Frame to append.
        writer: Writer from a previous call, or ``None`` to create one for
            ``filepath`` using the schema of ``df``.
        filepath: Output path; only used when ``writer`` is ``None``.

    Returns:
        The writer that was used, to be passed to the next call. The caller
        is responsible for closing it.
    """
    # Convert the frame itself; the previous code passed the builtin ``str``.
    table = pa.Table.from_pandas(df)
    if writer is None:
        writer = pq.ParquetWriter(filepath, table.schema)
    writer.write_table(table=table)
    return writer
def create_parquet():
    """Write one Parquet file per wiki file that has no output file yet.

    For each wiki file, every record returned by ``get_single_record`` for
    every processed page contributes one row with five fields (title, page
    id, sentence, linked pages, description). Wiki files whose output file
    already exists are skipped.
    """
    myschema = define_schema()
    wikifiles = findwikifiles()
    for wikifile in wikifiles:
        outfilename = get_outfilename_parquet(wikifile)
        if os.path.exists(outfilename):
            continue

        writer = pq.ParquetWriter(outfilename, schema=myschema)
        # One list per output column, in schema order:
        # titles, pageids, sentences, linked_pages, descriptions.
        column_values = [[], [], [], [], []]
        pages = process_file(wikifile)
        for page_number, (title, pageid, text) in enumerate(pages):
            if page_number % 100 == 99:
                logging.info(f"processing {title}")
            for record in get_single_record(title, pageid, text):
                for position, values in enumerate(column_values):
                    values.append(record[position])

        table = pa.Table.from_arrays(column_values, schema=myschema)
        writer.write_table(table)
        writer.close()
def df_to_parquet(df, filename, workdir=None, chunksize=100000, debug=False):
    """Write a DataFrame to a ZSTD-compressed Parquet file in row chunks.

    An existing file at the destination is removed first. An empty frame
    produces no output file.

    Args:
        df: pandas DataFrame to write; its index is not preserved.
        filename: Name of the output file.
        workdir: Optional directory that ``filename`` is joined onto.
        chunksize: Maximum number of rows per written table.
        debug: When true, print the row range of every chunk.

    Raises:
        ValueError: If ``chunksize`` is not positive (previously this made
            the chunking loop run forever).
    """
    if chunksize <= 0:
        raise ValueError('chunksize must be a positive integer')

    if workdir:
        full_filename = os.path.join(workdir, filename)
    else:
        full_filename = filename

    # check if we are overwriting an existing file
    if os.path.exists(full_filename):
        os.remove(full_filename)

    writer = None
    try:
        # The final step covers the dangling rows that do not fill a chunk.
        for start in range(0, len(df), chunksize):
            chunk = df.iloc[start:start + chunksize]
            if debug:
                print('Writing ' + str(start) + '-' + str(start + len(chunk)))
            data_table = pa.Table.from_pandas(chunk, preserve_index=False)
            # create writer if we did not have one yet
            if writer is None:
                writer = pq.ParquetWriter(full_filename, data_table.schema,
                                          compression='ZSTD')
            writer.write_table(data_table)
    finally:
        # close the writer if we made one, also when a chunk failed
        if writer is not None:
            writer.close()
def update_files(tables, writers, prefix):
    """Append the current rows of each table to its Parquet file.

    Handles the tables ``variants``, ``annotations`` and ``gts``. A writer
    for ``<prefix>_<table>.parquet`` is created and stored in ``writers``
    when the entry for that table is falsy.

    Args:
        tables: Mapping of table name to data accepted by ``pd.DataFrame``.
        writers: Mapping of table name to writer; updated in place.
        prefix: Prefix of the output file names.
    """
    schemas = get_schemas()
    for table_name in ("variants", "annotations", "gts"):
        frame = pd.DataFrame(tables[table_name])
        arrow_table = pa.Table.from_pandas(
            frame, schema=schemas[table_name], preserve_index=False)
        if not writers[table_name]:
            writers[table_name] = pq.ParquetWriter(
                f"{prefix}_{table_name}.parquet",
                schema=schemas[table_name],
                compression='snappy')
        writers[table_name].write_table(table=arrow_table)