Example #1
def test_numpy_base_object(tmpdir):
    # ARROW-2040: deserialized Numpy array should keep a reference to the
    # owner of its memory
    path = os.path.join(str(tmpdir), 'zzz.bin')
    data = np.arange(12, dtype=np.int32)

    with open(path, 'wb') as f:
        f.write(pa.serialize(data).to_buffer())

    serialized = pa.read_serialized(pa.OSFile(path))
    result = serialized.deserialize()
    assert_equal(result, data)
    serialized = None
    assert_equal(result, data)
    assert result.base is not None
Example #2
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [
            name for name, dtype in arrow_fields if "timestamp" in dtype
        ]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's ParquetWriter can handle chunked writes.
        # Arrow's input streams can handle zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use snappy codec for Parquet because Drill doesn't read zstd
        parquet_writer = pq.ParquetWriter(parquet_file,
                                          schema=arrow_schema,
                                          compression='snappy',
                                          version="2.0",
                                          use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(
            reader,
            header=None,
            names=column_names,
            dtype=dict(pandas_fields),
            true_values=map_to_bytes('T'),
            false_values=map_to_bytes('F'),
            chunksize=BUFFER_SIZE_ROWS,
            parse_dates=date_cols)

        chunked_write(df_iterator, parquet_writer, date_cols)
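chunked_write is not shown here; a minimal sketch of what such a helper might look like, assuming each chunk's dtypes line up with the Arrow schema passed to the ParquetWriter (date_cols is accepted only to match the call above and is left unused, since read_csv already parsed the dates):

def chunked_write(df_iterator, parquet_writer, date_cols):
    # Stream each pandas chunk into the open ParquetWriter as its own row group,
    # then close the writer so the Parquet footer is written.
    for chunk in df_iterator:
        table = pa.Table.from_pandas(chunk, preserve_index=False)
        parquet_writer.write_table(table)
    parquet_writer.close()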
Example #3
    def __init__(
        self,
        schema: Optional[pa.Schema] = None,
        features: Optional[Features] = None,
        path: Optional[str] = None,
        stream: Optional[pa.NativeFile] = None,
        fingerprint: Optional[str] = None,
        writer_batch_size: Optional[int] = None,
        disable_nullable: bool = False,
        update_features: bool = False,
        with_metadata: bool = True,
        unit: str = "examples",
    ):
        if path is None and stream is None:
            raise ValueError(
                "At least one of path and stream must be provided.")
        if features is not None:
            self._features = features
            self._schema = pa.schema(features.type)
        elif schema is not None:
            self._schema: pa.Schema = schema
            self._features = Features.from_arrow_schema(self._schema)
        else:
            self._features = None
            self._schema = None

        if disable_nullable and self._schema is not None:
            self._schema = pa.schema(
                pa.field(field.name, field.type, nullable=False)
                for field in self._schema)

        self._path = path
        if stream is None:
            self.stream = pa.OSFile(self._path, "wb")
        else:
            self.stream = stream

        self.fingerprint = fingerprint
        self.disable_nullable = disable_nullable
        self.writer_batch_size = writer_batch_size or DEFAULT_MAX_BATCH_SIZE
        self.update_features = update_features
        self.with_metadata = with_metadata
        self.unit = unit

        self._num_examples = 0
        self._num_bytes = 0
        self.current_rows = []
        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
Example #4
def export(dataset,
           path,
           column_names=None,
           byteorder="=",
           shuffle=False,
           selection=False,
           progress=None,
           virtual=True,
           sort=None,
           ascending=True):
    table = _export_table(dataset, column_names, byteorder, shuffle, selection,
                          progress, virtual, sort, ascending)
    b = table.to_batches()
    with pa.OSFile(path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
        writer.write_table(table)
        writer.close()
Example #5
def test_read_year_month_nano_interval(tmpdir):
    """ARROW-15783: Verify to_pandas works for interval types.

    Interval types require static structures to be enabled. This test verifies
    that they are when no other library functions are invoked.
    """
    mdn_interval_type = pa.month_day_nano_interval()
    schema = pa.schema([pa.field('nums', mdn_interval_type)])

    path = tmpdir.join('file.arrow').strpath
    with pa.OSFile(path, 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            interval_array = pa.array([(1, 2, 3)], type=mdn_interval_type)
            batch = pa.record_batch([interval_array], schema)
            writer.write(batch)
    invoke_script('read_record_batch.py', path)
Example #6
def make_arrow(root, dataset_root):
    for split in ["val", "train"]:
        with open(f"{root}/{split}_annot.json", "r") as fp:
            captions = json.load(fp)

        iid2captions = dict()
        for cap in tqdm(captions):
            iid = cap[0].split("/")[-1]
            iid2captions[iid] = [cap[1]]

        paths = list(glob(f"{root}/images_{split}/*/*"))
        random.shuffle(paths)
        caption_paths = [
            path for path in paths if path.split("/")[-1] in iid2captions
        ]
        if len(paths) == len(caption_paths):
            print("all images have caption annotations")
        else:
            print("not all images have caption annotations")
        print(
            len(paths),
            len(caption_paths),
            len(iid2captions),
        )

        sub_len = int(len(caption_paths) // 100000)
        subs = list(range(sub_len + 1))
        for sub in subs:
            sub_paths = caption_paths[sub * 100000:(sub + 1) * 100000]
            bs = [path2rest(path, iid2captions) for path in tqdm(sub_paths)]
            dataframe = pd.DataFrame(
                bs,
                columns=["image", "caption", "image_id", "split"],
            )

            table = pa.Table.from_pandas(dataframe)

            os.makedirs(dataset_root, exist_ok=True)
            with pa.OSFile(
                    f"{dataset_root}/conceptual_caption_{split}_{sub}.arrow",
                    "wb") as sink:
                with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                    writer.write_table(table)
            del dataframe
            del table
            del bs
            gc.collect()
Example #7
def open(path,
         mode='rb',
         fs_options={},
         fs=None,
         for_arrow=False,
         mmap=False,
         encoding="utf8"):
    if is_file_object(path):
        return path
    fs, path = parse(path, fs_options=fs_options, fs=fs, for_arrow=for_arrow)
    if fs is None:
        path = stringyfy(path)
        if for_arrow:
            if fs_options:
                raise ValueError(
                    f'fs_options not supported for local files. You passed: {repr(fs_options)}.'
                )
            if mmap:
                return pa.memory_map(path, mode)
            else:
                return pa.OSFile(path, mode)
        else:
            if 'b' not in mode:
                return normal_open(path, mode, encoding=encoding)
            else:
                return normal_open(path, mode)
    if mode == 'rb':

        def create():
            return fs.open_input_file(path)
    elif mode == "r":

        def create():
            return io.TextIOWrapper(fs.open_input_file(path),
                                    encoding=encoding)
    elif mode == 'wb':

        def create():
            return fs.open_output_stream(path)
    elif mode == "w":

        def create():
            return io.TextIOWrapper(fs.open_output_stream(path),
                                    encoding=encoding)
    else:
        raise ValueError(f'Only mode=rb/wb/r/w are supported, not {mode}')
    return FileProxy(create(), path, create)
Example #8
    def dump_cache(self, cache_path: Optional[str] = None) -> str:
        """
        Saves this dataset at cache_path. Dumped datasets can be loaded with the
        DiskBackedDataset.load_cache static method. All fields contained in this
        dataset must be serializable using pickle.

        Parameters
        ----------
        cache_path: Optional[str]
            Path to the directory where the cache file will be saved.
            The whole directory will be used as the cache and will be deleted
            when `delete_cache` is called. It is recommended to create a new
            directory to use exclusively as the cache, or to leave this as None.

            If None, a temporary directory will be created.

        Returns
        -------
        str
            The chosen cache directory path. Useful when cache_path is None and a
            temporary directory is created.
        """
        if cache_path == self.cache_path:
            raise ValueError(
                "Cache path same as datasets cache path. "
                f"Dataset can't overwrite its own cache. Cache path: {cache_path}"
            )

        if cache_path is None:
            cache_path = tempfile.mkdtemp(prefix=TEMP_CACHE_FILENAME_PREFIX)

        if not os.path.isdir(cache_path):
            os.mkdir(cache_path)

        # pickle fields
        cache_fields_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME)
        with open(cache_fields_path, "wb") as fields_cache_file:
            pickle.dump(self.fields, fields_cache_file)

        # dump table
        cache_table_path = os.path.join(cache_path, CACHE_TABLE_FILENAME)
        with pa.OSFile(cache_table_path, "wb") as f:
            with pa.RecordBatchFileWriter(f, self.table.schema) as writer:
                writer.write(self.table)

        return cache_path
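A hedged usage sketch for dump_cache, assuming dataset is a DiskBackedDataset instance:

cache_dir = dataset.dump_cache()                    # passing None creates a temporary directory
restored = DiskBackedDataset.load_cache(cache_dir)  # reloads the pickled fields and the Arrow table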
Example #9
    def __init__(
        self,
        data_type: Optional[pa.DataType] = None,
        schema: Optional[pa.Schema] = None,
        path: Optional[str] = None,
        stream: Optional[pa.NativeFile] = None,
        writer_batch_size: Optional[int] = None,
        disable_nullable: bool = True,
    ):
        if path is None and stream is None:
            raise ValueError(
                "At least one of path and stream must be provided.")

        if data_type is not None:
            self._type: pa.DataType = data_type
            self._schema: pa.Schema = pa.schema(field for field in self._type)
        elif schema is not None:
            self._schema: pa.Schema = schema
            self._type: pa.DataType = pa.struct(field
                                                for field in self._schema)
        else:
            self._schema = None
            self._type = None

        if disable_nullable and self._schema is not None:
            self._schema = pa.schema(
                pa.field(field.name, field.type, nullable=False)
                for field in self._type)
            self._type = pa.struct(
                pa.field(field.name, field.type, nullable=False)
                for field in self._type)

        self._path = path
        if stream is None:
            self.stream = pa.OSFile(self._path, "wb")
        else:
            self.stream = stream

        self.writer_batch_size = writer_batch_size or DEFAULT_MAX_BATCH_SIZE

        self._num_examples = 0
        self._num_bytes = 0
        self.current_rows = []

        self._build_writer(schema=self._schema)
Example #10
    def write_backing_store_from_ensemble_dataframe(
        storage_dir: Path, storage_key: str, ensemble_df: pd.DataFrame
    ) -> None:

        table = pa.Table.from_pandas(ensemble_df, preserve_index=False)

        # The input DF may contain an ENSEMBLE column (which we'll drop before writing),
        # but it is probably an error if there is more than one unique value in it
        if "ENSEMBLE" in ensemble_df:
            if ensemble_df["ENSEMBLE"].nunique() > 1:
                raise KeyError("Input data contains more than one unique ensemble name")
            table = table.drop(["ENSEMBLE"])

        # Write to arrow format
        arrow_file_name: Path = storage_dir / (storage_key + ".arrow")
        with pa.OSFile(str(arrow_file_name), "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
Example #11
    def __init__(self, size_vocabulary, embeddings, path, is_train, n_classes=3, data_percentage=1.0):
        base_name = os.path.basename(path)
        if is_train:
            saved_input_filename = "%s/%s-%d-train.pkl" % (path, base_name, n_classes)
        else:
            saved_input_filename = "%s/%s-%d-test.pkl" % (path, base_name, n_classes)
        if os.path.exists(saved_input_filename):
            input_file = open(saved_input_filename, 'rb')
            buf = input_file.read()
            all_data_node_id, all_data_node_type = pyarrow.deserialize(buf)
            input_file.close()
        else:
            all_data_node_id, all_data_node_type = load_program_graphs_from_directory(path, is_train, n_classes, data_percentage)
            all_data_node_id = np.array(all_data_node_id)[0:len(all_data_node_id)]
            all_data_node_type = np.array(all_data_node_type)[0:len(all_data_node_type)]
            buf = pyarrow.serialize((all_data_node_id, all_data_node_type)).to_buffer()
            out = pyarrow.OSFile(saved_input_filename, 'wb')
            out.write(buf)
            out.close()
        
        self.pretrained_embeddings = embeddings
        # print(all_data_node_id)
        if is_train == True:
            print("Number of all training data : " + str(len(all_data_node_id)))
        else:
            print("Number of all testing data : " + str(len(all_data_node_id)))
        self.n_edge_types =  find_max_edge_id(all_data_node_id)
        # print("Edge types : " + str(self.n_edge_types))
        max_node_id = find_max_node_id(all_data_node_id)
        max_node_type = find_max_node_id(all_data_node_type)
        print("Max node id : " + str(max_node_id))
        print("Max node type : " + str(max_node_type))
        # self.n_node = size_vocabulary
        self.n_node_by_id = max_node_id
        self.n_node = max_node_id # set n_node = n_node_by_id
        self.n_node_by_type = max_node_type
        
        all_data_node_id = convert_program_data(all_data_node_id,1, self.n_node_by_id)
        all_data_node_type = convert_program_data(all_data_node_type,1, self.n_node_by_type)

        self.all_data_node_id = all_data_node_id
        self.all_data_node_type = all_data_node_type
        
        self.data = all_data_node_id
Example #12
def make_arrow(root, dataset_root):
    with open(f"{root}/karpathy/dataset_flickr30k.json", "r") as fp:
        captions = json.load(fp)

    captions = captions["images"]

    iid2captions = defaultdict(list)
    iid2split = dict()

    for cap in tqdm(captions):
        filename = cap["filename"]
        iid2split[filename] = cap["split"]
        for c in cap["sentences"]:
            iid2captions[filename].append(c["raw"])

    paths = list(glob(f"{root}/flickr30k-images/*.jpg"))
    random.shuffle(paths)
    caption_paths = [path for path in paths if path.split("/")[-1] in iid2captions]

    if len(paths) == len(caption_paths):
        print("all images have caption annotations")
    else:
        print("not all images have caption annotations")
    print(
        len(paths), len(caption_paths), len(iid2captions),
    )

    bs = [path2rest(path, iid2captions, iid2split) for path in tqdm(caption_paths)]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]

        dataframe = pd.DataFrame(
            batches, columns=["image", "caption", "image_id", "split"],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(
            f"{dataset_root}/f30k_caption_karpathy_{split}.arrow", "wb"
        ) as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
Example #13
def test_arrow_chunk(scidb_con, url):
    prefix = 'arrow_chunk'
    url = '{}/{}'.format(url, prefix)
    schema = '<v:int64> [i=0:999:0:1000]'

    # Store
    # if url.startswith('s3://'):
    scidb_con.iquery("""
xsave(
  build({}, i),
  '{}')""".format(schema, url))

    # Re-write one SciDB Chunk file to use multiple Arrow Chunks
    if url.startswith('s3://'):
        s3_key = '{}/{}/chunks/c_0'.format(base_prefix, prefix)
        obj = s3_con.get_object(Bucket=s3_bucket, Key=s3_key)
        reader = pyarrow.ipc.open_stream(obj['Body'].read())
    elif url.startswith('file://'):
        fn = '{}/{}/chunks/c_0'.format(fs_base, prefix)
        reader = pyarrow.ipc.open_stream(pyarrow.OSFile(fn))

    tbl = reader.read_all()

    if url.startswith('s3://'):
        sink = pyarrow.BufferOutputStream()
        writer = pyarrow.ipc.RecordBatchStreamWriter(sink, tbl.schema)
    elif url.startswith('file://'):
        writer = pyarrow.ipc.RecordBatchStreamWriter(fn, tbl.schema)

    batches = tbl.to_batches(max_chunksize=200)  # 1000 / 200 = 5 chunks
    writer.write_table(pyarrow.Table.from_batches(batches))
    writer.close()

    if url.startswith('s3://'):
        s3_con.put_object(Body=sink.getvalue().to_pybytes(),
                          Bucket=s3_bucket,
                          Key=s3_key)

    # Input
    que = "xinput('{}')".format(url)

    with pytest.raises(requests.exceptions.HTTPError):
        array = scidb_con.iquery(que, fetch=True)
Example #14
def open_for_arrow(path, mode='rb', fs_options={}, mmap=False):
    '''When the file will be passed to Arrow, we want a file object that Arrow likes.

    This might avoid performance issues with the GIL, or call overhead.
    '''
    import pyarrow as pa
    if is_file_object(path):
        return path
    path = stringyfy(path)
    scheme, _ = split_scheme(path)
    if scheme is None:
        if fs_options:
            raise ValueError(f'fs_options not supported for local files. You passed: {repr(fs_options)}.')
        if mmap:
            return pa.memory_map(path, mode)
        else:
            return pa.OSFile(path, mode)
    else:
        return open(path, mode=mode, fs_options=fs_options).file
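A possible way to use the helper above (the path is hypothetical and the file is assumed to be in the Arrow IPC file format):

f = open_for_arrow('/data/example.arrow', mmap=True)
table = pa.ipc.open_file(f).read_all()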
Example #15
    def __init__(self,
                 data_type: Optional[pa.DataType] = None,
                 schema: Optional[pa.Schema] = None,
                 path: Optional[str] = None,
                 stream: Optional[pa.NativeFile] = None,
                 writer_batch_size: Optional[int] = None,
                 disable_nullable: bool = True):
        if data_type is None and schema is None:
            raise ValueError(
                "At least one of data_type and schema must be provided.")
        if path is None and stream is None:
            raise ValueError(
                "At least one of path and stream must be provided.")

        if data_type is not None:
            self._type: pa.DataType = data_type
            self._schema: pa.Schema = pa.schema(field for field in self._type)
        else:
            self._schema: pa.Schema = schema
            self._type: pa.DataType = pa.struct(field
                                                for field in self._schema)

        if disable_nullable:
            self._schema = pa.schema(
                pa.field(field.name, field.type, nullable=False)
                for field in self._type)
            self._type = pa.struct(
                pa.field(field.name, field.type, nullable=False)
                for field in self._type)

        self._path = path
        if stream is None:
            self.stream = pa.OSFile(self._path, 'wb')
        else:
            self.stream = stream

        self.writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
        self.writer_batch_size = writer_batch_size

        self._num_examples = 0
        self._num_bytes = 0
        self.current_rows = []
Example #16
def _output_to_disk(obj: pa.Buffer, full_path: str,
                    serialization: Serialization) -> None:
    """Outputs a serialized object to disk to the specified path.

    Args:
        obj: Object to output to disk.
        full_path: Full path to save the data to.
        serialization: Serialization of the `obj`. For possible values
            see :class:`Serialization`.

    Raises:
        ValueError: If the specified serialization is not valid.
    """
    if isinstance(serialization, Serialization):
        with pa.OSFile(f"{full_path}.{serialization.name}", "wb") as f:
            f.write(obj)
    else:
        raise ValueError("Function not defined for specified 'serialization'")

    return
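A small usage sketch, assuming Serialization is an enum defined in the surrounding module and that ARROW_TABLE is one of its members (hypothetical name):

buf = pa.py_buffer(b"already-serialized payload")
_output_to_disk(buf, "/tmp/output", Serialization.ARROW_TABLE)  # writes /tmp/output.ARROW_TABLE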
Example #17
def test_numpy_matrix_serialization(tmpdir):
    class CustomType(object):
        def __init__(self, val):
            self.val = val

    path = os.path.join(str(tmpdir), 'pyarrow_npmatrix_serialization_test.bin')
    array = np.random.randint(low=-1, high=1, size=(2, 2))

    for data_type in [str, int, float, CustomType]:
        matrix = np.matrix(array.astype(data_type))

        with open(path, 'wb') as f:
            f.write(pa.serialize(matrix).to_buffer())

        serialized = pa.read_serialized(pa.OSFile(path))
        result = serialized.deserialize()
        assert_equal(result, matrix)
        assert_equal(result.dtype, matrix.dtype)
        serialized = None
        assert_equal(result, matrix)
        assert result.base is not None
Example #18
def to_arrow(file, arrow_file, chunksize=2000000, crs=None, **kwargs):
    """ Converts a spatial vector file into an arrow binary file.
    In case of CSV it uses pyarrow CSV reader, otherwise GDAL is used in order
    to parse the file.
    Parameters:
        file (string): The input file full path.
        arrow_file (string): The full path of output arrow file.
        chunksize (int): The chunksize of the file that is read in each iteration (default: 2000000)
        crs (string): The native CRS of the spatial file (optional)
    """
    with pa.OSFile(arrow_file, 'wb') as sink:
        writer = None
        for table in to_arrow_table(file,
                                    chunksize=chunksize,
                                    crs=crs,
                                    **kwargs):
            b = table.to_batches()
            if writer is None:
                writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
            writer.write_table(table)
        if writer is not None:
            # Close the stream writer so the end-of-stream marker is written.
            writer.close()
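The stream-format file written above can be read back with pyarrow's IPC reader, for example:

with pa.OSFile(arrow_file, 'rb') as source:
    table = pa.ipc.open_stream(source).read_all()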
Example #19
def test_put_arrow(test_client):
    """
    Convert a simple json object to arrow, and write it to the datastore.
    """
    jdf = [{"a":1,"b":10},{"a":2,"b":20}]
    buf = json_to_arrow(jdf)
    assert(isinstance(buf, bytes))
    cell_hash = "test3"
    frame_name = str(uuid.uuid4())
    response = test_client.put('/{}/{}'.format(cell_hash, frame_name),data=buf,
                               content_type='application/octet-stream')
    assert(response.status_code == 200)
    assert(os.path.exists('{}/{}/{}'.format(storage_backend.store.dirname,
                                            cell_hash,
                                            frame_name)))
    f = pa.OSFile('{}/{}/{}'.format(storage_backend.store.dirname,
                                    cell_hash,
                                    frame_name))
    buf = f.read_buffer(10000)
    new_jdf = json.loads(arrow_to_json(buf))
    assert(new_jdf == jdf)
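json_to_arrow and arrow_to_json are helpers from the project under test; one plausible (hypothetical) sketch of the encoding side, using the Arrow IPC stream format, would be:

def json_to_arrow(records):
    # records: a list of dicts -> Arrow IPC stream bytes
    table = pa.Table.from_pylist(records)
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue().to_pybytes()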
Example #20
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()), ('seek', (0, )), ('size', ()), ('flush', ()),
               ('readable', ()), ('writable', ()), ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Example #21
def make_arrow(root, dataset_root):
    with open(f"{root}/annotations/region_descriptions.json", "r") as fp:
        captions = json.load(fp)

    iid2captions = defaultdict(list)
    for cap in tqdm(captions):
        cap = cap["regions"]
        for c in cap:
            iid2captions[c["image_id"]].append(c)

    paths = list(glob(f"{root}/images/VG_100K/*.jpg")) + list(
        glob(f"{root}/images/VG_100K_2/*.jpg"))
    random.shuffle(paths)
    caption_paths = [
        path for path in paths if int(path.split("/")[-1][:-4]) in iid2captions
    ]

    if len(paths) == len(caption_paths):
        print("all images have caption annotations")
    else:
        print("not all images have caption annotations")
    print(
        len(paths),
        len(caption_paths),
        len(iid2captions),
    )

    bs = [path2rest(path, iid2captions) for path in tqdm(caption_paths)]
    dataframe = pd.DataFrame(
        bs,
        columns=["image", "caption", "width", "height", "x", "y", "image_id"],
    )
    table = pa.Table.from_pandas(dataframe)

    os.makedirs(dataset_root, exist_ok=True)
    with pa.OSFile(f"{dataset_root}/vg.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)
Example #22
def make_arrow(root, dataset_root):
    with open(f"{root}/v2_OpenEnded_mscoco_train2014_questions.json",
              "r") as fp:
        questions_train2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_val2014_questions.json", "r") as fp:
        questions_val2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test2015_questions.json",
              "r") as fp:
        questions_test2015 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test-dev2015_questions.json",
              "r") as fp:
        questions_test_dev2015 = json.load(fp)["questions"]

    with open(f"{root}/v2_mscoco_train2014_annotations.json", "r") as fp:
        annotations_train2014 = json.load(fp)["annotations"]
    with open(f"{root}/v2_mscoco_val2014_annotations.json", "r") as fp:
        annotations_val2014 = json.load(fp)["annotations"]

    annotations = dict()

    # An image can have multiple questions; aggregate them here.
    for split, questions in zip(
        ["train", "val", "test", "test-dev"],
        [
            questions_train2014,
            questions_val2014,
            questions_test2015,
            questions_test_dev2015,
        ],
    ):
        _annot = defaultdict(dict)
        for q in tqdm(questions):
            _annot[q["image_id"]][q["question_id"]] = [q["question"]]

        annotations[split] = _annot

    all_major_answers = list()
    # Collect all the answers.
    for split, annots in zip(
        ["train", "val"],
        [annotations_train2014, annotations_val2014],
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            all_major_answers.append(q["multiple_choice_answer"])

    all_major_answers = [
        normalize_word(word) for word in tqdm(all_major_answers)
    ]
    counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 9}
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())

    for split, annots in zip(
        ["train", "val"],
        [annotations_train2014, annotations_val2014],
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            answers = q["answers"]
            answer_count = {}
            for answer in answers:
                answer_ = answer["answer"]
                answer_count[answer_] = answer_count.get(answer_, 0) + 1

            labels = []
            scores = []
            for answer in answer_count:
                if answer not in ans2label:
                    continue
                labels.append(ans2label[answer])
                score = get_score(answer_count[answer])
                scores.append(score)

            _annot[q["image_id"]][q["question_id"]].append({
                "labels": labels,
                "scores": scores,
            })

    # _annot[q["image_id"]][q["question_id"]] = ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}]
    # Drop questions that end up with no labels.
    for split in ["train", "val"]:
        filtered_annot = dict()
        for ik, iv in annotations[split].items():
            # ik image_id
            # iv : {458752000: ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}],
            # 458752001: ['What position is this man playing?', {'labels': [1, 67],
            # 'scores': [1.0, 0.3]}], 458752002: ['What color is the players shirt?',
            # {'labels': [2], 'scores': [1.0]}], 458752003: ['Is this man a professional baseball player?',
            # {'labels': [3, 9], 'scores': [1.0, 0.3]}]}
            new_q = dict()
            for qk, qv in iv.items():
                # qv : ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}]
                if len(qv[1]["labels"]) != 0:
                    new_q[qk] = qv
            if len(new_q) != 0:
                filtered_annot[ik] = new_q
        annotations[split] = filtered_annot

    for split in [
            "train",
            "val",
            "test",
            "test-dev",
    ]:
        annot = annotations[split]
        split_name = {
            "train": "train2014",
            "val": "val2014",
            "test": "test2015",
            "test-dev": "test2015",
        }[split]
        paths = list(glob(f"{root}/{split_name}/*.jpg"))
        random.shuffle(paths)
        annot_paths = [
            path for path in paths
            if int(path.split("/")[-1].split("_")[-1][:-4]) in annot
        ]

        if len(paths) == len(annot_paths):
            print("all images have caption annotations")
        else:
            print("not all images have caption annotations")
        print(
            len(paths),
            len(annot_paths),
            len(annot),
        )

        bs = [
            path2rest(path, split, annotations, label2ans)
            for path in tqdm(annot_paths)
        ]

        dataframe = pd.DataFrame(
            bs,
            columns=[
                "image",
                "questions",
                "answers",
                "answer_labels",
                "answer_scores",
                "image_id",
                "question_id",
                "split",
            ],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/vqav2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)

    table = pa.ipc.RecordBatchFileReader(
        pa.memory_map(f"{dataset_root}/vqav2_val.arrow", "r")).read_all()

    pdtable = table.to_pandas()

    df1 = pdtable[:-1000]
    df2 = pdtable[-1000:]

    df1 = pa.Table.from_pandas(df1)
    df2 = pa.Table.from_pandas(df2)

    with pa.OSFile(f"{dataset_root}/vqav2_trainable_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df1.schema) as writer:
            writer.write_table(df1)

    with pa.OSFile(f"{dataset_root}/vqav2_rest_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df2.schema) as writer:
            writer.write_table(df2)
Example #23
def test_native_file_open_error():
    with assert_file_not_found():
        pa.OSFile('non_existent_file', 'rb')
    with assert_file_not_found():
        pa.memory_map('non_existent_file', 'rb')
Example #24
def check_compressed_concatenated(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
Example #25
tables_files = {
    "mediacloud": [
        pjoin(usenews_arrows19, "mediacloud2019.arrow"),
        pjoin(usenews_arrows20, "mediacloud2020.arrow"),
    ],
    "crowdtangle": [
        pjoin(usenews_arrows19, "crowdtangle2019.arrow"),
        pjoin(usenews_arrows20, "crowdtangle2020.arrow"),
    ]
}
tables = {}

for name, files in tables_files.items():
    sub_tables = []
    for file in files:
        source = pyarrow.memory_map(file, 'r')
        sub_tables.append(pyarrow.ipc.RecordBatchFileReader(source).read_all())
    # Convert to pandas here: the groupby/map/merge steps below are pandas operations.
    tables[name] = pyarrow.concat_tables(sub_tables).to_pandas()

tables["crowdtangle"]["link"] = tables["crowdtangle"]["link"].map(urlnorm)
tables["mediacloud"]["guid"] = tables["mediacloud"]["guid"].map(urlnorm)

joined = pandas.merge(
    tables["crowdtangle"].groupby(["link"]).sum().reset_index(),
    tables["mediacloud"],
    left_on="link",
    right_on="guid",
)

# Convert the merged DataFrame back to an Arrow table before writing it out.
joined_table = pyarrow.Table.from_pandas(joined, preserve_index=False)

with pyarrow.OSFile(sys.argv[3], 'wb') as sink:
    with pyarrow.RecordBatchFileWriter(sink, joined_table.schema) as writer:
        writer.write_table(joined_table)
Example #26
def make_arrow(root, dataset_root):
    train_data = list(
        map(json.loads, open(f"{root}/nlvr2/data/train.json").readlines())
    )
    test1_data = list(
        map(json.loads, open(f"{root}/nlvr2/data/test1.json").readlines())
    )
    dev_data = list(map(json.loads, open(f"{root}/nlvr2/data/dev.json").readlines()))

    balanced_test1_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/balanced/balanced_test1.json").readlines(),
        )
    )
    balanced_dev_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/balanced/balanced_dev.json").readlines(),
        )
    )

    unbalanced_test1_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/unbalanced/unbalanced_test1.json").readlines(),
        )
    )
    unbalanced_dev_data = list(
        map(
            json.loads,
            open(f"{root}/nlvr2/data/unbalanced/unbalanced_dev.json").readlines(),
        )
    )

    splits = [
        "train",
        "dev",
        "test1",
        "balanced_dev",
        "balanced_test1",
        "unbalanced_dev",
        "unbalanced_test1",
    ]

    datas = [
        train_data,
        dev_data,
        test1_data,
        balanced_dev_data,
        balanced_test1_data,
        unbalanced_dev_data,
        unbalanced_test1_data,
    ]

    annotations = dict()

    for split, data in zip(splits, datas):
        _annot = defaultdict(list)
        for row in tqdm(data):
            _annot["-".join(row["identifier"].split("-")[:-1])].append(row)
        annotations[split] = _annot

    for split in splits:
        bs = [
            process(root, iden, row) for iden, row in tqdm(annotations[split].items())
        ]

        dataframe = pd.DataFrame(
            bs, columns=["image_0", "image_1", "questions", "answers", "identifier"],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/nlvr2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
Example #27
#!/usr/bin/env python

import pyarrow as pa
import numpy as np

ndarray = np.random.randn(10, 6)
print(ndarray)
tensor = pa.Tensor.from_numpy(ndarray)
with pa.OSFile("/tmp/tensor.arrow", "wb") as sink:
    pa.ipc.write_tensor(tensor, sink)
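Reading the tensor back could look like this (pa.ipc.read_tensor also accepts a memory-mapped file):

with pa.memory_map("/tmp/tensor.arrow", "r") as source:
    restored = pa.ipc.read_tensor(source)
    print(restored.to_numpy())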
Example #28
def table_to_bytes(table):
    global _temp_dir
    if _temp_dir is None or not os.path.exists(_temp_dir):
        _temp_dir = tempfile.mkdtemp(prefix='knime-python-')
        # Delete temporary directory upon Python shutdown.
        atexit.register(close)
    fd, path = tempfile.mkstemp(suffix='.dat',
                                prefix='python-to-java-',
                                dir=_temp_dir,
                                text=False)
    try:
        os.close(fd)

        mp = pyarrow.default_memory_pool()
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # add the index column to the list of columns
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(
                pyarrow.Array.from_pandas(table._data_frame.index,
                                          type=to_pyarrow_type(_types_.STRING),
                                          memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")

        # Serialize the dataframe into a list of pyarrow.Array column by column
        for i in range(len(table._data_frame.columns)):
            # Do not allocate a buffer for columns that only contain missing values. We track and transfer their names
            # to give them special treatment on Java side.
            # This also covers tables of row count zero.
            if table._data_frame.iloc[:, i].isnull().all():
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<f8')))
            elif table.get_type(i) == _types_.FLOAT_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_set_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_set_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_set_generator(
                            table._data_frame.iloc[:, i])))
            # Workaround until numpy typecasts are implemented in pyarrow
            elif table.get_type(
                    i
            ) == _types_.INTEGER and table._data_frame.iloc[:,
                                                            i].dtype == np.int64:
                col_arrays.append(
                    pyarrow.Array.from_pandas(np.array(
                        table._data_frame.iloc[:, i], dtype=np.int32),
                                              memory_pool=mp))
            # Workaround until fixed in pyarrow ... it is assumed that the first non-None object is bytearray if any
            elif table.get_type(i) == _types_.BYTES and type(
                    get_first_not_None(
                        table._data_frame.iloc[:, i])) == bytearray:
                col_arrays.append(
                    pyarrow.Array.from_pandas(map(
                        lambda x: x if x is None else bytes(x),
                        table._data_frame.iloc[:, i]),
                                              memory_pool=mp))
            # create pyarrow.Array
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # pyarrow.binary() type is not allowed as argument for type atm
                if pa_type == pyarrow.binary():
                    col_arrays.append(
                        pyarrow.BinaryArray.from_pandas(
                            table._data_frame.iloc[:, i], memory_pool=mp))
                else:
                    col_arrays.append(
                        pyarrow.Array.from_pandas(table._data_frame.iloc[:, i],
                                                  type=pa_type,
                                                  memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))

        # Construct metadata
        custom_metadata = {
            "index_columns": [all_names[0]],
            "columns": [{
                "name": all_names[0],
                "metadata": {
                    "serializer_id": "",
                    "type_id": _types_.STRING
                }
            }],
            "missing_columns":
            missing_names,
            "num_rows":
            len(table._data_frame)
        }

        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [
                    _types_.BYTES, _types_.BYTES_LIST, _types_.BYTES_SET
            ]:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id":
                        table.get_column_serializers().get(name, ""),
                        "type_id":
                        table.get_type(col_idx)
                    }
                })
            else:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id": "",
                        "type_id": table.get_type(col_idx)
                    }
                })

        metadata = {
            b'ArrowSerializationLibrary':
            json.dumps(custom_metadata).encode('utf-8')
        }

        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)

        schema = batch.schema.remove_metadata()
        schema = schema.add_metadata(metadata)

        # Write data to file and return filepath
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()
        return bytearray(path, 'utf-8')
    except BaseException:
        PythonUtils.invoke_safely(None, os.remove, [path])
        raise
Example #29
def deserialize_data_frame(path):
    global read_data_frame, read_types, read_serializers, _pandas_native_types_, path_to_mmap
    path_to_mmap = path
    with pyarrow.OSFile(path, 'rb') as f:
        stream_reader = pyarrow.RecordBatchStreamReader(f)
        arrowtable = stream_reader.read_all()
        # metadata
        pandas_metadata = json.loads(
            arrowtable.schema.metadata[b'pandas'].decode('utf-8'))
        names = []
        for col in pandas_metadata['columns']:
            names.append(col['name'])
            read_types.append(col['metadata']['type_id'])
            ser_id = col['metadata']['serializer_id']
            if ser_id != '':
                read_serializers[col['name']] = ser_id

        # data
        read_data_frame = pandas.DataFrame()
        for arrowcolumn in arrowtable.itercolumns():
            typeidx = names.index(arrowcolumn.name)
            coltype = read_types[typeidx]
            if coltype in _pandas_native_types_:
                dfcol = arrowcolumn.to_pandas()
            else:
                if coltype == _types_.INTEGER_LIST or coltype == _types_.INTEGER_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.INTEGER_SET, 4,
                                             'i'))
                elif coltype == _types_.LONG_LIST or coltype == _types_.LONG_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.LONG_SET, 8,
                                             'q'))
                elif coltype == _types_.DOUBLE_LIST or coltype == _types_.DOUBLE_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.DOUBLE_SET, 8,
                                             'd'))
                elif coltype == _types_.FLOAT_LIST or coltype == _types_.FLOAT_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.FLOAT_SET, 4,
                                             'f'))
                elif coltype == _types_.BOOLEAN_LIST or coltype == _types_.BOOLEAN_SET:
                    dfcol = pandas.Series(
                        boolean_collection_generator(
                            arrowcolumn, coltype == _types_.BOOLEAN_SET))
                elif coltype == _types_.STRING_LIST or coltype == _types_.STRING_SET:
                    dfcol = pandas.Series(
                        string_collection_generator(
                            arrowcolumn, coltype == _types_.STRING_SET))
                elif coltype == _types_.BYTES_LIST or coltype == _types_.BYTES_SET:
                    dfcol = pandas.Series(
                        bytes_collection_generator(
                            arrowcolumn, coltype == _types_.BYTES_SET))
                else:
                    raise KeyError('Type with id ' + str(coltype) +
                                   ' cannot be deserialized!')
            # Note: we only have one index column (the KNIME RowKeys)
            if arrowcolumn.name in pandas_metadata['index_columns']:
                indexcol = dfcol
            else:
                read_data_frame[arrowcolumn.name] = dfcol

        if not 'indexcol' in locals():
            raise NameError(
                'Variable indexcol has not been set properly, exiting!')

        if len(read_data_frame.columns) > 0:
            read_data_frame.set_index(keys=indexcol, inplace=True)
        else:
            read_data_frame = pandas.DataFrame(index=indexcol)
Example #30
def memory_and_io_interfaces_example():
	# pyarrow.Buffer.

	data = b"abcdefghijklmnopqrstuvwxyz"

	# Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the data bytes object.
	buf = pa.py_buffer(data)
	# External memory, under the form of a raw pointer and size, can also be referenced using the foreign_buffer() function.
	#buf = pa.foreign_buffer(data)

	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	print("memoryview(buf) = {}.".format(memoryview(buf)))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# Memory pools.

	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = pa.allocate_buffer(1024, resizable=True)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf.resize(2048)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = None
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

	#--------------------
	# Input and output streams.

	buf = memoryview(b"some data")
	stream = pa.input_stream(buf)

	print("stream.read(4) = {}.".format(stream.read(4)))

	import gzip
	with gzip.open("./example.gz", "wb") as f:
		f.write(b"some data\n" * 3)

	stream = pa.input_stream("./example.gz")
	print("stream.read() = {}.".format(stream.read()))

	with pa.output_stream("./example1.dat") as stream:
		stream.write(b"some data")

	f = open("./example1.dat", "rb")
	print("f.read() = {}.".format(f.read()))

	#--------------------
	# On-disk and memory mapped files.

	# Using regular Python.
	with open("./example2.dat", "wb") as f:
		f.write(b"some example data")

	file_obj = pa.OSFile("./example2.dat")
	print("file_obj.read(4) = {}.".format(file_obj.read(4)))

	# Using pyarrow's OSFile class.
	with pa.OSFile("./example3.dat", "wb") as f:
		f.write(b"some example data")

	mmap = pa.memory_map("./example3.dat")
	print("mmap.read(4) = {}.".format(mmap.read(4)))

	mmap.seek(0)
	buf = mmap.read_buffer(4)
	print("buf = {}.".format(buf))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# In-memory reading and writing.

	writer = pa.BufferOutputStream()
	writer.write(b"hello, friends")
	buf = writer.getvalue()
	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	reader = pa.BufferReader(buf)
	reader.seek(7)
	print("reader.read(7) = {}.".format(reader.read(7)))