def test_numpy_base_object(tmpdir):
    # ARROW-2040: deserialized Numpy array should keep a reference to the
    # owner of its memory
    path = os.path.join(str(tmpdir), 'zzz.bin')
    data = np.arange(12, dtype=np.int32)
    with open(path, 'wb') as f:
        f.write(pa.serialize(data).to_buffer())
    serialized = pa.read_serialized(pa.OSFile(path))
    result = serialized.deserialize()
    assert_equal(result, data)
    serialized = None
    assert_equal(result, data)
    assert result.base is not None
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [name for name, dtype in arrow_fields if "timestamp" in dtype]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's WriteParquet can handle chunked writes.
        # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use snappy codec for Parquet because Drill doesn't read zstd
        parquet_writer = pq.ParquetWriter(parquet_file,
                                          schema=arrow_schema,
                                          compression='snappy',
                                          version="2.0",
                                          use_dictionary=True)

        df_iterator: TextFileReader = pd.read_csv(
            reader,
            header=None,
            names=column_names,
            dtype=dict(pandas_fields),
            true_values=map_to_bytes('T'),
            false_values=map_to_bytes('F'),
            chunksize=BUFFER_SIZE_ROWS,
            parse_dates=date_cols)
        chunked_write(df_iterator, parquet_writer, date_cols)
def __init__(
    self,
    schema: Optional[pa.Schema] = None,
    features: Optional[Features] = None,
    path: Optional[str] = None,
    stream: Optional[pa.NativeFile] = None,
    fingerprint: Optional[str] = None,
    writer_batch_size: Optional[int] = None,
    disable_nullable: bool = False,
    update_features: bool = False,
    with_metadata: bool = True,
    unit: str = "examples",
):
    if path is None and stream is None:
        raise ValueError("At least one of path and stream must be provided.")

    if features is not None:
        self._features = features
        self._schema = pa.schema(features.type)
    elif schema is not None:
        self._schema: pa.Schema = schema
        self._features = Features.from_arrow_schema(self._schema)
    else:
        self._features = None
        self._schema = None

    if disable_nullable and self._schema is not None:
        self._schema = pa.schema(
            pa.field(field.name, field.type, nullable=False) for field in self._schema)

    self._path = path
    if stream is None:
        self.stream = pa.OSFile(self._path, "wb")
    else:
        self.stream = stream

    self.fingerprint = fingerprint
    self.disable_nullable = disable_nullable
    self.writer_batch_size = writer_batch_size or DEFAULT_MAX_BATCH_SIZE
    self.update_features = update_features
    self.with_metadata = with_metadata
    self.unit = unit

    self._num_examples = 0
    self._num_bytes = 0
    self.current_rows = []
    self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
def export(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False,
           progress=None, virtual=True, sort=None, ascending=True):
    table = _export_table(dataset, column_names, byteorder, shuffle, selection, progress,
                          virtual, sort, ascending)
    b = table.to_batches()
    with pa.OSFile(path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
        writer.write_table(table)
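# Read-back sketch (not part of the export code above, added for illustration): the file
# written by `export` is in the Arrow IPC stream format, so it can be reopened with
# pyarrow's stream reader. The function name and `path` argument here are assumptions.
import pyarrow as pa

def read_exported_arrow_stream(path):
    # Open the stream file and concatenate all record batches into one Table.
    with pa.OSFile(path, 'rb') as source:
        return pa.ipc.open_stream(source).read_all()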
def test_read_year_month_nano_interval(tmpdir):
    """ARROW-15783: Verify to_pandas works for interval types.

    Interval types require static structures to be enabled. This test verifies
    that they are when no other library functions are invoked.
    """
    mdn_interval_type = pa.month_day_nano_interval()
    schema = pa.schema([pa.field('nums', mdn_interval_type)])

    path = tmpdir.join('file.arrow').strpath
    with pa.OSFile(path, 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            interval_array = pa.array([(1, 2, 3)], type=mdn_interval_type)
            batch = pa.record_batch([interval_array], schema)
            writer.write(batch)
    invoke_script('read_record_batch.py', path)
def make_arrow(root, dataset_root):
    for split in ["val", "train"]:
        with open(f"{root}/{split}_annot.json", "r") as fp:
            captions = json.load(fp)

        iid2captions = dict()
        for cap in tqdm(captions):
            iid = cap[0].split("/")[-1]
            iid2captions[iid] = [cap[1]]

        paths = list(glob(f"{root}/images_{split}/*/*"))
        random.shuffle(paths)
        caption_paths = [
            path for path in paths if path.split("/")[-1] in iid2captions
        ]

        if len(paths) == len(caption_paths):
            print("all images have caption annotations")
        else:
            print("not all images have caption annotations")
        print(
            len(paths),
            len(caption_paths),
            len(iid2captions),
        )

        sub_len = int(len(caption_paths) // 100000)
        subs = list(range(sub_len + 1))
        for sub in subs:
            sub_paths = caption_paths[sub * 100000:(sub + 1) * 100000]
            bs = [path2rest(path, iid2captions) for path in tqdm(sub_paths)]
            dataframe = pd.DataFrame(
                bs,
                columns=["image", "caption", "image_id", "split"],
            )

            table = pa.Table.from_pandas(dataframe)

            os.makedirs(dataset_root, exist_ok=True)
            with pa.OSFile(
                    f"{dataset_root}/conceptual_caption_{split}_{sub}.arrow",
                    "wb") as sink:
                with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                    writer.write_table(table)

            del dataframe
            del table
            del bs
            gc.collect()
def open(path, mode='rb', fs_options={}, fs=None, for_arrow=False, mmap=False, encoding="utf8"):
    if is_file_object(path):
        return path
    fs, path = parse(path, fs_options=fs_options, fs=fs, for_arrow=for_arrow)
    if fs is None:
        path = stringyfy(path)
        if for_arrow:
            if fs_options:
                raise ValueError(
                    f'fs_options not supported for local files. You passed: {repr(fs_options)}.')
            if mmap:
                return pa.memory_map(path, mode)
            else:
                return pa.OSFile(path, mode)
        else:
            if 'b' not in mode:
                return normal_open(path, mode, encoding=encoding)
            else:
                return normal_open(path, mode)
    if mode == 'rb':
        def create():
            return fs.open_input_file(path)
    elif mode == "r":
        def create():
            return io.TextIOWrapper(fs.open_input_file(path), encoding=encoding)
    elif mode == 'wb':
        def create():
            return fs.open_output_stream(path)
    elif mode == "w":
        def create():
            return io.TextIOWrapper(fs.open_output_stream(path), encoding=encoding)
    else:
        raise ValueError(f'Only mode=rb/wb/r/w are supported, not {mode}')
    return FileProxy(create(), path, create)
def dump_cache(self, cache_path: Optional[str] = None) -> str:
    """
    Saves this dataset at cache_path. Dumped datasets can be loaded with the
    DiskBackedDataset.load_cache static method. All fields contained in this
    dataset must be serializable using pickle.

    Parameters
    ----------
    cache_path: Optional[str]
        Path to the directory where the cache file will be saved.
        The whole directory will be used as the cache and will be deleted
        when `delete_cache` is called. It is recommended to create a new
        directory to use exclusively as the cache, or to leave this as None.

        If None, a temporary directory will be created.

    Returns
    -------
    str
        The chosen cache directory path. Useful when cache_path is None and a
        temporary directory is created.
    """
    if cache_path == self.cache_path:
        raise ValueError(
            "Cache path same as datasets cache path. "
            f"Dataset can't overwrite its own cache. Cache path: {cache_path}"
        )

    if cache_path is None:
        cache_path = tempfile.mkdtemp(prefix=TEMP_CACHE_FILENAME_PREFIX)

    if not os.path.isdir(cache_path):
        os.mkdir(cache_path)

    # pickle fields
    cache_fields_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME)
    with open(cache_fields_path, "wb") as fields_cache_file:
        pickle.dump(self.fields, fields_cache_file)

    # dump table
    cache_table_path = os.path.join(cache_path, CACHE_TABLE_FILENAME)
    with pa.OSFile(cache_table_path, "wb") as f:
        with pa.RecordBatchFileWriter(f, self.table.schema) as writer:
            writer.write(self.table)

    return cache_path
def __init__(
    self,
    data_type: Optional[pa.DataType] = None,
    schema: Optional[pa.Schema] = None,
    path: Optional[str] = None,
    stream: Optional[pa.NativeFile] = None,
    writer_batch_size: Optional[int] = None,
    disable_nullable: bool = True,
):
    if path is None and stream is None:
        raise ValueError("At least one of path and stream must be provided.")

    if data_type is not None:
        self._type: pa.DataType = data_type
        self._schema: pa.Schema = pa.schema(field for field in self._type)
    elif schema is not None:
        self._schema: pa.Schema = schema
        self._type: pa.DataType = pa.struct(field for field in self._schema)
    else:
        self._schema = None
        self._type = None

    if disable_nullable and self._schema is not None:
        self._schema = pa.schema(
            pa.field(field.name, field.type, nullable=False) for field in self._type)
        self._type = pa.struct(
            pa.field(field.name, field.type, nullable=False) for field in self._type)

    self._path = path
    if stream is None:
        self.stream = pa.OSFile(self._path, "wb")
    else:
        self.stream = stream

    self.writer_batch_size = writer_batch_size or DEFAULT_MAX_BATCH_SIZE

    self._num_examples = 0
    self._num_bytes = 0
    self.current_rows = []

    self._build_writer(schema=self._schema)
def write_backing_store_from_ensemble_dataframe(
    storage_dir: Path, storage_key: str, ensemble_df: pd.DataFrame
) -> None:
    table = pa.Table.from_pandas(ensemble_df, preserve_index=False)

    # The input DF may contain an ENSEMBLE column (which we'll drop before writing),
    # but it is probably an error if there is more than one unique value in it
    if "ENSEMBLE" in ensemble_df:
        if ensemble_df["ENSEMBLE"].nunique() > 1:
            raise KeyError("Input data contains more than one unique ensemble name")
        table = table.drop(["ENSEMBLE"])

    # Write to arrow format
    arrow_file_name: Path = storage_dir / (storage_key + ".arrow")
    with pa.OSFile(str(arrow_file_name), "wb") as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)
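# Illustrative counterpart (an assumption, not taken from the original module): read the
# ".arrow" backing store written above back into a pandas DataFrame. The function name is
# hypothetical; its parameters mirror those of the writer.
import pandas as pd
import pyarrow as pa
from pathlib import Path

def read_backing_store_as_dataframe(storage_dir: Path, storage_key: str) -> pd.DataFrame:
    arrow_file_name = storage_dir / (storage_key + ".arrow")
    # Memory-map the Arrow file and load every record batch into a single Table.
    with pa.memory_map(str(arrow_file_name), "r") as source:
        table = pa.ipc.RecordBatchFileReader(source).read_all()
    return table.to_pandas()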
def __init__(self, size_vocabulary, embeddings, path, is_train, n_classes=3, data_percentage=1.0):
    base_name = os.path.basename(path)
    if is_train:
        saved_input_filename = "%s/%s-%d-train.pkl" % (path, base_name, n_classes)
    else:
        saved_input_filename = "%s/%s-%d-test.pkl" % (path, base_name, n_classes)

    if os.path.exists(saved_input_filename):
        input_file = open(saved_input_filename, 'rb')
        buf = input_file.read()
        all_data_node_id, all_data_node_type = pyarrow.deserialize(buf)
        input_file.close()
    else:
        all_data_node_id, all_data_node_type = load_program_graphs_from_directory(
            path, is_train, n_classes, data_percentage)
        all_data_node_id = np.array(all_data_node_id)[0:len(all_data_node_id)]
        all_data_node_type = np.array(all_data_node_type)[0:len(all_data_node_type)]
        buf = pyarrow.serialize((all_data_node_id, all_data_node_type)).to_buffer()
        out = pyarrow.OSFile(saved_input_filename, 'wb')
        out.write(buf)
        out.close()

    self.pretrained_embeddings = embeddings
    # print(all_data_node_id)
    if is_train == True:
        print("Number of all training data : " + str(len(all_data_node_id)))
    else:
        print("Number of all testing data : " + str(len(all_data_node_id)))

    self.n_edge_types = find_max_edge_id(all_data_node_id)
    # print("Edge types : " + str(self.n_edge_types))
    max_node_id = find_max_node_id(all_data_node_id)
    max_node_type = find_max_node_id(all_data_node_type)
    print("Max node id : " + str(max_node_id))
    print("Max node type : " + str(max_node_type))

    # self.n_node = size_vocabulary
    self.n_node_by_id = max_node_id
    self.n_node = max_node_id  # set n_node = n_node_by_id
    self.n_node_by_type = max_node_type

    all_data_node_id = convert_program_data(all_data_node_id, 1, self.n_node_by_id)
    all_data_node_type = convert_program_data(all_data_node_type, 1, self.n_node_by_type)

    self.all_data_node_id = all_data_node_id
    self.all_data_node_type = all_data_node_type
    self.data = all_data_node_id
def make_arrow(root, dataset_root): with open(f"{root}/karpathy/dataset_flickr30k.json", "r") as fp: captions = json.load(fp) captions = captions["images"] iid2captions = defaultdict(list) iid2split = dict() for cap in tqdm(captions): filename = cap["filename"] iid2split[filename] = cap["split"] for c in cap["sentences"]: iid2captions[filename].append(c["raw"]) paths = list(glob(f"{root}/flickr30k-images/*.jpg")) random.shuffle(paths) caption_paths = [path for path in paths if path.split("/")[-1] in iid2captions] if len(paths) == len(caption_paths): print("all images have caption annotations") else: print("not all images have caption annotations") print( len(paths), len(caption_paths), len(iid2captions), ) bs = [path2rest(path, iid2captions, iid2split) for path in tqdm(caption_paths)] for split in ["train", "val", "test"]: batches = [b for b in bs if b[-1] == split] dataframe = pd.DataFrame( batches, columns=["image", "caption", "image_id", "split"], ) table = pa.Table.from_pandas(dataframe) os.makedirs(dataset_root, exist_ok=True) with pa.OSFile( f"{dataset_root}/f30k_caption_karpathy_{split}.arrow", "wb" ) as sink: with pa.RecordBatchFileWriter(sink, table.schema) as writer: writer.write_table(table)
def test_arrow_chunk(scidb_con, url):
    prefix = 'arrow_chunk'
    url = '{}/{}'.format(url, prefix)

    schema = '<v:int64> [i=0:999:0:1000]'

    # Store
    # if url.startswith('s3://'):
    scidb_con.iquery("""
        xsave(
          build({}, i),
          '{}')""".format(schema, url))

    # Re-write one SciDB Chunk file to use multiple Arrow Chunks
    if url.startswith('s3://'):
        s3_key = '{}/{}/chunks/c_0'.format(base_prefix, prefix)
        obj = s3_con.get_object(Bucket=s3_bucket, Key=s3_key)
        reader = pyarrow.ipc.open_stream(obj['Body'].read())
    elif url.startswith('file://'):
        fn = '{}/{}/chunks/c_0'.format(fs_base, prefix)
        reader = pyarrow.open_stream(pyarrow.OSFile(fn))

    tbl = reader.read_all()

    if url.startswith('s3://'):
        sink = pyarrow.BufferOutputStream()
        writer = pyarrow.ipc.RecordBatchStreamWriter(sink, tbl.schema)
    elif url.startswith('file://'):
        writer = pyarrow.ipc.RecordBatchStreamWriter(fn, tbl.schema)

    batches = tbl.to_batches(max_chunksize=200)  # 1000 / 200 = 5 chunks
    writer.write_table(pyarrow.Table.from_batches(batches))
    writer.close()

    if url.startswith('s3://'):
        s3_con.put_object(Body=sink.getvalue().to_pybytes(),
                          Bucket=s3_bucket,
                          Key=s3_key)

    # Input
    que = "xinput('{}')".format(url)

    with pytest.raises(requests.exceptions.HTTPError):
        array = scidb_con.iquery(que, fetch=True)
def open_for_arrow(path, mode='rb', fs_options={}, mmap=False):
    '''When the file will be passed to arrow, we want a file object arrow likes.

    This might avoid performance issues with the GIL, or call overhead.
    '''
    import pyarrow as pa
    if is_file_object(path):
        return path
    path = stringyfy(path)
    scheme, _ = split_scheme(path)
    if scheme is None:
        if fs_options:
            raise ValueError(
                f'fs_options not supported for local files. You passed: {repr(fs_options)}.')
        if mmap:
            return pa.memory_map(path, mode)
        else:
            return pa.OSFile(path, mode)
    else:
        return open(path, mode=mode, fs_options=fs_options).file
def __init__(self,
             data_type: Optional[pa.DataType] = None,
             schema: Optional[pa.Schema] = None,
             path: Optional[str] = None,
             stream: Optional[pa.NativeFile] = None,
             writer_batch_size: Optional[int] = None,
             disable_nullable: bool = True):
    if data_type is None and schema is None:
        raise ValueError("At least one of data_type and schema must be provided.")
    if path is None and stream is None:
        raise ValueError("At least one of path and stream must be provided.")

    if data_type is not None:
        self._type: pa.DataType = data_type
        self._schema: pa.Schema = pa.schema(field for field in self._type)
    else:
        self._schema: pa.Schema = schema
        self._type: pa.DataType = pa.struct(field for field in self._schema)

    if disable_nullable:
        self._schema = pa.schema(
            pa.field(field.name, field.type, nullable=False) for field in self._type)
        self._type = pa.struct(
            pa.field(field.name, field.type, nullable=False) for field in self._type)

    self._path = path
    if stream is None:
        self.stream = pa.OSFile(self._path, 'wb')
    else:
        self.stream = stream

    self.writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
    self.writer_batch_size = writer_batch_size

    self._num_examples = 0
    self._num_bytes = 0
    self.current_rows = []
def _output_to_disk(obj: pa.Buffer, full_path: str, serialization: Serialization) -> None:
    """Outputs a serialized object to disk to the specified path.

    Args:
        obj: Object to output to disk.
        full_path: Full path to save the data to.
        serialization: Serialization of the `obj`. For possible values see
            :class:`Serialization`.

    Raises:
        ValueError: If the specified serialization is not valid.
    """
    if isinstance(serialization, Serialization):
        with pa.OSFile(f"{full_path}.{serialization.name}", "wb") as f:
            f.write(obj)
    else:
        raise ValueError("Function not defined for specified 'serialization'")

    return
def test_numpy_matrix_serialization(tmpdir):
    class CustomType(object):
        def __init__(self, val):
            self.val = val

    path = os.path.join(str(tmpdir), 'pyarrow_npmatrix_serialization_test.bin')
    array = np.random.randint(low=-1, high=1, size=(2, 2))

    for data_type in [str, int, float, CustomType]:
        matrix = np.matrix(array.astype(data_type))

        with open(path, 'wb') as f:
            f.write(pa.serialize(matrix).to_buffer())

        serialized = pa.read_serialized(pa.OSFile(path))
        result = serialized.deserialize()
        assert_equal(result, matrix)
        assert_equal(result.dtype, matrix.dtype)
        serialized = None
        assert_equal(result, matrix)
        assert result.base is not None
def to_arrow(file, arrow_file, chunksize=2000000, crs=None, **kwargs):
    """Converts a spatial vector file into an arrow binary file.

    In case of CSV it uses the pyarrow CSV reader, otherwise GDAL is used in
    order to parse the file.

    Parameters:
        file (string): The input file full path.
        arrow_file (string): The full path of the output arrow file.
        chunksize (int): The chunksize of the file that is read in each
            iteration (default: 2000000).
        crs (string): The native CRS of the spatial file (optional).
    """
    with pa.OSFile(arrow_file, 'wb') as sink:
        writer = None
        for table in to_arrow_table(file, chunksize=chunksize, crs=crs, **kwargs):
            b = table.to_batches()
            if writer is None:
                writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
            writer.write_table(table)
        sink.close()
def test_put_arrow(test_client):
    """
    Convert a simple json object to arrow, and write it to the datastore.
    """
    jdf = [{"a": 1, "b": 10}, {"a": 2, "b": 20}]
    buf = json_to_arrow(jdf)
    assert(isinstance(buf, bytes))
    cell_hash = "test3"
    frame_name = str(uuid.uuid4())
    response = test_client.put('/{}/{}'.format(cell_hash, frame_name),
                               data=buf,
                               content_type='application/octet-stream')
    assert(response.status_code == 200)
    assert(os.path.exists('{}/{}/{}'.format(storage_backend.store.dirname,
                                            cell_hash, frame_name)))
    f = pa.OSFile('{}/{}/{}'.format(storage_backend.store.dirname,
                                    cell_hash, frame_name))
    buf = f.read_buffer(10000)
    new_jdf = json.loads(arrow_to_json(buf))
    assert(new_jdf == jdf)
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()), ('seek', (0,)), ('size', ()), ('flush', ()),
               ('readable', ()), ('writable', ()), ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
def make_arrow(root, dataset_root): with open(f"{root}/annotations/region_descriptions.json", "r") as fp: captions = json.load(fp) iid2captions = defaultdict(list) for cap in tqdm(captions): cap = cap["regions"] for c in cap: iid2captions[c["image_id"]].append(c) paths = list(glob(f"{root}/images/VG_100K/*.jpg")) + list( glob(f"{root}/images/VG_100K_2/*.jpg")) random.shuffle(paths) caption_paths = [ path for path in paths if int(path.split("/")[-1][:-4]) in iid2captions ] if len(paths) == len(caption_paths): print("all images have caption annotations") else: print("not all images have caption annotations") print( len(paths), len(caption_paths), len(iid2captions), ) bs = [path2rest(path, iid2captions) for path in tqdm(caption_paths)] dataframe = pd.DataFrame( bs, columns=["image", "caption", "width", "height", "x", "y", "image_id"], ) table = pa.Table.from_pandas(dataframe) os.makedirs(dataset_root, exist_ok=True) with pa.OSFile(f"{dataset_root}/vg.arrow", "wb") as sink: with pa.RecordBatchFileWriter(sink, table.schema) as writer: writer.write_table(table)
def make_arrow(root, dataset_root): with open(f"{root}/v2_OpenEnded_mscoco_train2014_questions.json", "r") as fp: questions_train2014 = json.load(fp)["questions"] with open(f"{root}/v2_OpenEnded_mscoco_val2014_questions.json", "r") as fp: questions_val2014 = json.load(fp)["questions"] with open(f"{root}/v2_OpenEnded_mscoco_test2015_questions.json", "r") as fp: questions_test2015 = json.load(fp)["questions"] with open(f"{root}/v2_OpenEnded_mscoco_test-dev2015_questions.json", "r") as fp: questions_test_dev2015 = json.load(fp)["questions"] with open(f"{root}/v2_mscoco_train2014_annotations.json", "r") as fp: annotations_train2014 = json.load(fp)["annotations"] with open(f"{root}/v2_mscoco_val2014_annotations.json", "r") as fp: annotations_val2014 = json.load(fp)["annotations"] annotations = dict() # 一张图可能有多个question,这里聚合起来 for split, questions in zip( ["train", "val", "test", "test-dev"], [ questions_train2014, questions_val2014, questions_test2015, questions_test_dev2015, ], ): _annot = defaultdict(dict) for q in tqdm(questions): _annot[q["image_id"]][q["question_id"]] = [q["question"]] annotations[split] = _annot all_major_answers = list() # 把所有的答案拿到 for split, annots in zip( ["train", "val"], [annotations_train2014, annotations_val2014], ): _annot = annotations[split] for q in tqdm(annots): all_major_answers.append(q["multiple_choice_answer"]) all_major_answers = [ normalize_word(word) for word in tqdm(all_major_answers) ] counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 9} ans2label = {k: i for i, k in enumerate(counter.keys())} label2ans = list(counter.keys()) for split, annots in zip( ["train", "val"], [annotations_train2014, annotations_val2014], ): _annot = annotations[split] for q in tqdm(annots): answers = q["answers"] answer_count = {} for answer in answers: answer_ = answer["answer"] answer_count[answer_] = answer_count.get(answer_, 0) + 1 labels = [] scores = [] for answer in answer_count: if answer not in ans2label: continue labels.append(ans2label[answer]) score = get_score(answer_count[answer]) scores.append(score) _annot[q["image_id"]][q["question_id"]].append({ "labels": labels, "scores": scores, }) # _annot[q["image_id"]][q["question_id"]] = ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}] # 删除label=0的question for split in ["train", "val"]: filtered_annot = dict() for ik, iv in annotations[split].items(): # ik image_id # iv : {458752000: ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}], # 458752001: ['What position is this man playing?', {'labels': [1, 67], # 'scores': [1.0, 0.3]}], 458752002: ['What color is the players shirt?', # {'labels': [2], 'scores': [1.0]}], 458752003: ['Is this man a professional baseball player?', # {'labels': [3, 9], 'scores': [1.0, 0.3]}]} new_q = dict() for qk, qv in iv.items(): # qv : ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}] if len(qv[1]["labels"]) != 0: new_q[qk] = qv if len(new_q) != 0: filtered_annot[ik] = new_q annotations[split] = filtered_annot for split in [ "train", "val", "test", "test-dev", ]: annot = annotations[split] split_name = { "train": "train2014", "val": "val2014", "test": "test2015", "test-dev": "test2015", }[split] paths = list(glob(f"{root}/{split_name}/*.jpg")) random.shuffle(paths) annot_paths = [ path for path in paths if int(path.split("/")[-1].split("_")[-1][:-4]) in annot ] if len(paths) == len(annot_paths): print("all images have caption annotations") else: print("not all images have caption 
annotations") print( len(paths), len(annot_paths), len(annot), ) bs = [ path2rest(path, split, annotations, label2ans) for path in tqdm(annot_paths) ] dataframe = pd.DataFrame( bs, columns=[ "image", "questions", "answers", "answer_labels", "answer_scores", "image_id", "question_id", "split", ], ) table = pa.Table.from_pandas(dataframe) os.makedirs(dataset_root, exist_ok=True) with pa.OSFile(f"{dataset_root}/vqav2_{split}.arrow", "wb") as sink: with pa.RecordBatchFileWriter(sink, table.schema) as writer: writer.write_table(table) table = pa.ipc.RecordBatchFileReader( pa.memory_map(f"{dataset_root}/vqav2_val.arrow", "r")).read_all() pdtable = table.to_pandas() df1 = pdtable[:-1000] df2 = pdtable[-1000:] df1 = pa.Table.from_pandas(df1) df2 = pa.Table.from_pandas(df2) with pa.OSFile(f"{dataset_root}/vqav2_trainable_val.arrow", "wb") as sink: with pa.RecordBatchFileWriter(sink, df1.schema) as writer: writer.write_table(df1) with pa.OSFile(f"{dataset_root}/vqav2_rest_val.arrow", "wb") as sink: with pa.RecordBatchFileWriter(sink, df2.schema) as writer: writer.write_table(df2)
def test_native_file_open_error():
    with assert_file_not_found():
        pa.OSFile('non_existent_file', 'rb')
    with assert_file_not_found():
        pa.memory_map('non_existent_file', 'rb')
def check_compressed_concatenated(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
        pjoin(usenews_arrows19, "mediacloud2019.arrow"),
        pjoin(usenews_arrows20, "mediacloud2020.arrow"),
    ],
    "crowdtangle": [
        pjoin(usenews_arrows19, "crowdtangle2019.arrow"),
        pjoin(usenews_arrows20, "crowdtangle2020.arrow"),
    ]
}

tables = {}
for name, files in tables_files.items():
    sub_tables = []
    for file in files:
        source = pyarrow.memory_map(file, 'r')
        sub_tables.append(pyarrow.ipc.RecordBatchFileReader(source).read_all())
    tables[name] = pyarrow.concat_tables(sub_tables)

tables["crowdtangle"]["link"].map(urlnorm)
tables["mediacloud"]["guid"].map(urlnorm)

joined = pandas.merge(
    tables["crowdtangle"].groupby(["link"]).sum(),
    tables["mediacloud"],
    left_on="link",
    right_on="guid",
)

with pyarrow.OSFile(sys.argv[3], 'wb') as sink:
    with pyarrow.RecordBatchFileWriter(sink, joined.schema) as writer:
        writer.write_table(joined)
def make_arrow(root, dataset_root):
    train_data = list(
        map(json.loads, open(f"{root}/nlvr2/data/train.json").readlines())
    )
    test1_data = list(
        map(json.loads, open(f"{root}/nlvr2/data/test1.json").readlines())
    )
    dev_data = list(map(json.loads, open(f"{root}/nlvr2/data/dev.json").readlines()))

    balanced_test1_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/balanced/balanced_test1.json").readlines(),
        )
    )
    balanced_dev_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/balanced/balanced_dev.json").readlines(),
        )
    )

    unbalanced_test1_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/unbalanced/unbalanced_test1.json").readlines(),
        )
    )
    unbalanced_dev_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/unbalanced/unbalanced_dev.json").readlines(),
        )
    )

    splits = [
        "train",
        "dev",
        "test1",
        "balanced_dev",
        "balanced_test1",
        "unbalanced_dev",
        "unbalanced_test1",
    ]

    datas = [
        train_data,
        dev_data,
        test1_data,
        balanced_dev_data,
        balanced_test1_data,
        unbalanced_dev_data,
        unbalanced_test1_data,
    ]

    annotations = dict()

    for split, data in zip(splits, datas):
        _annot = defaultdict(list)
        for row in tqdm(data):
            _annot["-".join(row["identifier"].split("-")[:-1])].append(row)
        annotations[split] = _annot

    for split in splits:
        bs = [
            process(root, iden, row) for iden, row in tqdm(annotations[split].items())
        ]

        dataframe = pd.DataFrame(
            bs,
            columns=["image_0", "image_1", "questions", "answers", "identifier"],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/nlvr2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
#!/usr/bin/env python

import pyarrow as pa
import numpy as np

ndarray = np.random.randn(10, 6)
print(ndarray)

tensor = pa.Tensor.from_numpy(ndarray)

with pa.OSFile("/tmp/tensor.arrow", "wb") as sink:
    pa.write_tensor(tensor, sink)
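# Companion sketch (an assumption, not part of the original script): the tensor written
# above can be mapped back without copying. pa.ipc.read_tensor is used here; older
# pyarrow releases exposed the same call as pa.read_tensor.
import pyarrow as pa

with pa.memory_map("/tmp/tensor.arrow", "rb") as source:
    restored = pa.ipc.read_tensor(source)
    print(restored.to_numpy())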
def table_to_bytes(table):
    global _temp_dir
    if _temp_dir is None or not os.path.exists(_temp_dir):
        _temp_dir = tempfile.mkdtemp(prefix='knime-python-')
        # Delete temporary directory upon Python shutdown.
        atexit.register(close)
    fd, path = tempfile.mkstemp(suffix='.dat', prefix='python-to-java-',
                                dir=_temp_dir, text=False)
    try:
        os.close(fd)

        mp = pyarrow.default_memory_pool()
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # add the index column to the list of columns
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(
                pyarrow.Array.from_pandas(table._data_frame.index,
                                          type=to_pyarrow_type(_types_.STRING),
                                          memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")

        # Serialize the dataframe into a list of pyarrow.Array column by column
        for i in range(len(table._data_frame.columns)):
            # Do not allocate a buffer for columns that only contain missing values.
            # We track and transfer their names to give them special treatment on
            # Java side. This also covers tables of row count zero.
            if table._data_frame.iloc[:, i].isnull().all():
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_set_generator(table._data_frame.iloc[:, i])))
            # Workaround until numpy typecasts are implemented in pyarrow
            elif table.get_type(i) == _types_.INTEGER and table._data_frame.iloc[:, i].dtype == np.int64:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        np.array(table._data_frame.iloc[:, i], dtype=np.int32),
                        memory_pool=mp))
            # Workaround until fixed in pyarrow ... it is assumed that the first
            # non-None object is bytearray if any
            elif table.get_type(i) == _types_.BYTES and type(
                    get_first_not_None(table._data_frame.iloc[:, i])) == bytearray:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        map(lambda x: x if x is None else bytes(x),
                            table._data_frame.iloc[:, i]),
                        memory_pool=mp))
            # create pyarrow.Array
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # pyarrow.binary() type is not allowed as argument for type atm
                if pa_type == pyarrow.binary():
                    col_arrays.append(
                        pyarrow.BinaryArray.from_pandas(table._data_frame.iloc[:, i],
                                                        memory_pool=mp))
                else:
                    col_arrays.append(
                        pyarrow.Array.from_pandas(table._data_frame.iloc[:, i],
                                                  type=pa_type,
                                                  memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))

        # Construct metadata
        custom_metadata = {
            "index_columns": [all_names[0]],
            "columns": [{
                "name": all_names[0],
                "metadata": {
                    "serializer_id": "",
                    "type_id": _types_.STRING
                }
            }],
            "missing_columns": missing_names,
            "num_rows": len(table._data_frame)
        }

        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [_types_.BYTES, _types_.BYTES_LIST, _types_.BYTES_SET]:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id": table.get_column_serializers().get(name, ""),
                        "type_id": table.get_type(col_idx)
                    }
                })
            else:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id": "",
                        "type_id": table.get_type(col_idx)
                    }
                })

        metadata = {
            b'ArrowSerializationLibrary': json.dumps(custom_metadata).encode('utf-8')
        }

        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)

        schema = batch.schema.remove_metadata()
        schema = schema.add_metadata(metadata)

        # Write data to file and return filepath
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()

        return bytearray(path, 'utf-8')
    except BaseException:
        PythonUtils.invoke_safely(None, os.remove, [path])
        raise
def deserialize_data_frame(path):
    global read_data_frame, read_types, read_serializers, _pandas_native_types_, path_to_mmap
    path_to_mmap = path
    with pyarrow.OSFile(path, 'rb') as f:
        stream_reader = pyarrow.RecordBatchStreamReader(f)
        arrowtable = stream_reader.read_all()

    # metadata
    pandas_metadata = json.loads(arrowtable.schema.metadata[b'pandas'].decode('utf-8'))
    names = []
    for col in pandas_metadata['columns']:
        names.append(col['name'])
        read_types.append(col['metadata']['type_id'])
        ser_id = col['metadata']['serializer_id']
        if ser_id != '':
            read_serializers[col['name']] = ser_id

    # data
    read_data_frame = pandas.DataFrame()
    for arrowcolumn in arrowtable.itercolumns():
        typeidx = names.index(arrowcolumn.name)
        coltype = read_types[typeidx]
        if coltype in _pandas_native_types_:
            dfcol = arrowcolumn.to_pandas()
        else:
            if coltype == _types_.INTEGER_LIST or coltype == _types_.INTEGER_SET:
                dfcol = pandas.Series(
                    collection_generator(arrowcolumn, coltype == _types_.INTEGER_SET, 4, 'i'))
            elif coltype == _types_.LONG_LIST or coltype == _types_.LONG_SET:
                dfcol = pandas.Series(
                    collection_generator(arrowcolumn, coltype == _types_.LONG_SET, 8, 'q'))
            elif coltype == _types_.DOUBLE_LIST or coltype == _types_.DOUBLE_SET:
                dfcol = pandas.Series(
                    collection_generator(arrowcolumn, coltype == _types_.DOUBLE_SET, 8, 'd'))
            elif coltype == _types_.FLOAT_LIST or coltype == _types_.FLOAT_SET:
                dfcol = pandas.Series(
                    collection_generator(arrowcolumn, coltype == _types_.FLOAT_SET, 4, 'f'))
            elif coltype == _types_.BOOLEAN_LIST or coltype == _types_.BOOLEAN_SET:
                dfcol = pandas.Series(
                    boolean_collection_generator(arrowcolumn, coltype == _types_.BOOLEAN_SET))
            elif coltype == _types_.STRING_LIST or coltype == _types_.STRING_SET:
                dfcol = pandas.Series(
                    string_collection_generator(arrowcolumn, coltype == _types_.STRING_SET))
            elif coltype == _types_.BYTES_LIST or coltype == _types_.BYTES_SET:
                dfcol = pandas.Series(
                    bytes_collection_generator(arrowcolumn, coltype == _types_.BYTES_SET))
            else:
                raise KeyError('Type with id ' + str(coltype) + ' cannot be deserialized!')

        # Note: we only have one index column (the KNIME RowKeys)
        if arrowcolumn.name in pandas_metadata['index_columns']:
            indexcol = dfcol
        else:
            read_data_frame[arrowcolumn.name] = dfcol

    if not 'indexcol' in locals():
        raise NameError('Variable indexcol has not been set properly, exiting!')
    if len(read_data_frame.columns) > 0:
        read_data_frame.set_index(keys=indexcol, inplace=True)
    else:
        read_data_frame = pandas.DataFrame(index=indexcol)
def memory_and_io_interfaces_example():
    # pyarrow.Buffer.
    data = b"abcdefghijklmnopqrstuvwxyz"

    # Creating a Buffer in this way does not allocate any memory; it is a
    # zero-copy view on the memory exported from the data bytes object.
    buf = pa.py_buffer(data)
    # External memory, under the form of a raw pointer and size, can also be
    # referenced using the foreign_buffer() function.
    #buf = pa.foreign_buffer(data)

    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))
    print("memoryview(buf) = {}.".format(memoryview(buf)))
    print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # Memory pools.
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    buf = pa.allocate_buffer(1024, resizable=True)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    buf.resize(2048)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    buf = None
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

    #--------------------
    # Input and output streams.
    buf = memoryview(b"some data")
    stream = pa.input_stream(buf)
    print("stream.read(4) = {}.".format(stream.read(4)))

    import gzip
    with gzip.open("./example.gz", "wb") as f:
        f.write(b"some data\n" * 3)
    stream = pa.input_stream("./example.gz")
    print("stream.read() = {}.".format(stream.read()))

    with pa.output_stream("./example1.dat") as stream:
        stream.write(b"some data")
    f = open("./example1.dat", "rb")
    print("f.read() = {}.".format(f.read()))

    #--------------------
    # On-disk and memory mapped files.
    # Using regular Python.
    with open("./example2.dat", "wb") as f:
        f.write(b"some example data")

    file_obj = pa.OSFile("./example2.dat")
    print("file_obj.read(4) = {}.".format(file_obj.read(4)))

    # Using pyarrow's OSFile class.
    with pa.OSFile("./example3.dat", "wb") as f:
        f.write(b"some example data")

    mmap = pa.memory_map("./example3.dat")
    print("mmap.read(4) = {}.".format(mmap.read(4)))

    mmap.seek(0)
    buf = mmap.read_buffer(4)
    print("buf = {}.".format(buf))
    print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # In-memory reading and writing.
    writer = pa.BufferOutputStream()
    writer.write(b"hello, friends")
    buf = writer.getvalue()
    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))

    reader = pa.BufferReader(buf)
    reader.seek(7)
    print("reader.read(7) = {}.".format(reader.read(7)))