Code Example #1
File: test_feather.py Project: StevenMPhillips/arrow
    def test_overwritten_file(self):
        path = random_path()

        num_values = 100
        np.random.seed(0)

        values = np.random.randint(0, 10, size=num_values)
        write_feather(pd.DataFrame({'ints': values}), path)

        df = pd.DataFrame({'ints': values[0: num_values//2]})
        self._check_pandas_roundtrip(df, path=path)
Code Example #2
File: test_feather.py Project: sunchao/arrow
    def test_filelike_objects(self):
        from io import BytesIO

        buf = BytesIO()

        # the copy makes it non-strided
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=['a', 'b', 'c']).copy()
        write_feather(df, buf)

        buf.seek(0)

        result = read_feather(buf)
        assert_frame_equal(result, df)
Code Example #3
File: test_feather.py Project: emkornfield/arrow
def test_chunked_binary_error_message():
    # ARROW-3058: As Feather does not yet support chunked columns, we at least
    # make sure it's clear to the user what is going on

    # 2^31 + 1 bytes
    values = [b'x'] + [
        b'x' * (1 << 20)
    ] * 2 * (1 << 10)
    df = pd.DataFrame({'byte_col': values})

    with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum "
                       "capacity of a Feather binary column. This restriction "
                       "may be lifted in the future"):
        write_feather(df, io.BytesIO())
Code Example #4
File: test_feather.py Project: StevenMPhillips/arrow
    def test_delete_partial_file_on_error(self):
        # strings will fail
        df = pd.DataFrame(
            {
                'numbers': range(5),
                'strings': [b'foo', None, u'bar', 'qux', np.nan]},
            columns=['numbers', 'strings'])

        path = random_path()
        try:
            write_feather(df, path)
        except Exception:
            pass

        assert not os.path.exists(path)
Code Example #5
File: test_feather.py Project: emkornfield/arrow
    def test_num_columns_attr(self):
        df0 = pd.DataFrame({})
        df1 = pd.DataFrame({
            'foo': [1, 2, 3, 4, 5]
        })
        df2 = pd.DataFrame({
            'foo': [1, 2, 3, 4, 5],
            'bar': [1, 2, 3, 4, 5]
        })
        for df, ncols in zip([df0, df1, df2], [0, 1, 2]):
            path = random_path()
            self.test_files.append(path)
            write_feather(df, path)

            reader = FeatherReader(path)
            assert reader.num_columns == ncols
Code Example #6
File: test_feather.py Project: sunchao/arrow
    def test_num_rows_attr(self):
        df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
        path = random_path()
        self.test_files.append(path)
        write_feather(df, path)

        reader = FeatherReader(path)
        assert reader.num_rows == len(df)

        df = pd.DataFrame({})
        path = random_path()
        self.test_files.append(path)
        write_feather(df, path)

        reader = FeatherReader(path)
        assert reader.num_rows == 0
Code Example #7
def to_feather(df, path):
    """
    Write a DataFrame to the feather-format

    Parameters
    ----------
    df : DataFrame
    path : string file path, or file-like object

    """
    path = _stringify_path(path)
    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    feather = _try_import()[0]
    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("feather does not support serializing {} "
                         "for the index; you can .reset_index()"
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("feather does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("feather does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_feather(df, path)
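Note: the validation above means a caller must supply a default RangeIndex and string column names. A minimal sketch of preparing a frame that satisfies those checks, assuming pyarrow is installed (the file name is only an illustration, not taken from the source):

import pandas as pd

df = pd.DataFrame({'value': [1.0, 2.0]}, index=['a', 'b'])

# A labeled index would be rejected by the validation above, so move it into
# an ordinary column first and let the default RangeIndex take its place.
df.reset_index().to_feather('values.feather')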
Code Example #8
    def test_delete_partial_file_on_error(self):
        if sys.platform == 'win32':
            pytest.skip('Windows hangs on to file handle for some reason')

        # strings will fail
        df = pd.DataFrame(
            {
                'numbers': range(5),
                'strings': [b'foo', None, u'bar', 'qux', np.nan]},
            columns=['numbers', 'strings'])

        path = random_path()
        try:
            write_feather(df, path)
        except Exception:
            pass

        assert not os.path.exists(path)
Code Example #9
def test_boolean_nulls(version):
    # pandas requires upcast to object dtype
    path = random_path()
    TEST_FILES.append(path)

    num_values = 100
    np.random.seed(0)

    mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 10, size=num_values) < 5

    table = pa.table([pa.array(values, mask=mask)], names=['bools'])
    write_feather(table, path, version=version)

    expected = values.astype(object)
    expected[mask] = None

    ex_frame = pd.DataFrame({'bools': expected})

    result = read_feather(path)
    assert_frame_equal(result, ex_frame)
Code Example #10
    def test_delete_partial_file_on_error(self):
        if sys.platform == 'win32':
            pytest.skip('Windows hangs on to file handle for some reason')

        class CustomClass(object):
            pass

        # strings will fail
        df = pd.DataFrame(
            {
                'numbers': range(5),
                'strings': [b'foo', None, u'bar', CustomClass(), np.nan]},
            columns=['numbers', 'strings'])

        path = random_path()
        try:
            write_feather(df, path)
        except Exception:
            pass

        assert not os.path.exists(path)
Code Example #11
def test_read_table(version):
    num_values = (100, 100)
    path = random_path()

    TEST_FILES.append(path)

    values = np.random.randint(0, 100, size=num_values)
    columns = ['col_' + str(i) for i in range(100)]
    table = pa.Table.from_arrays(values, columns)

    write_feather(table, path, version=version)

    result = read_table(path)
    assert result.equals(table)

    # Test without memory mapping
    result = read_table(path, memory_map=False)
    assert result.equals(table)

    result = read_feather(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result)
Code Example #12
def test_dataset(version):
    num_values = (100, 100)
    num_files = 5
    paths = [random_path() for i in range(num_files)]
    data = {
        "col_" + str(i): np.random.randn(num_values[0])
        for i in range(num_values[1])
    }
    table = pa.table(data)

    TEST_FILES.extend(paths)
    for index, path in enumerate(paths):
        rows = (
            index * (num_values[0] // num_files),
            (index + 1) * (num_values[0] // num_files),
        )

        write_feather(table[rows[0]: rows[1]], path, version=version)

    data = FeatherDataset(paths).read_table()
    assert data.equals(table)
Code Example #13
File: test_feather.py Project: StevenMPhillips/arrow
    def _check_pandas_roundtrip(self, df, expected=None, path=None,
                                columns=None, null_counts=None):
        if path is None:
            path = random_path()

        self.test_files.append(path)
        write_feather(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')

        result = read_feather(path, columns)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)

        if null_counts is None:
            null_counts = np.zeros(len(expected.columns))

        np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                      null_counts)
Code Example #14
def root2disk(fileName, chunkSize=1000000):
    """
    A function to convert input ROOT file into parquet and feather formats for later 
    faster input/output from disk.

    Parameters
    ----------
    - fileName : string
       Specifies location and name of the file

    - chunkSize : int
       A part of the whole sample that is read and processed at a time. 
    
    Raises
    ------
    - Nothing. Used to have TypeError if no parameters are given. 
    
    Returns
    -------
    - Void

    """

    if not isinstance(fileName, str) and not isinstance(chunkSize, int):
        raise TypeError(
            "Please specify both fileName and chunkSize parameters! Exiting.")

    #pbar = ProgressBar()
    count = 1
    #for df in pbar(read_root(filename, chunksize=oneMillion)):
    print('Processing >>')
    for df in read_root(paths=fileName, chunksize=chunkSize):
        print('>>' * count)
        feather.write_feather(df, 'ndf_{0}.feather'.format(count))
        df.to_parquet('ndf_{0}.parquet'.format(count),
                      engine='fastparquet',
                      compression='gzip')
        count += 1
        if count > 100:
            break
Code Example #15
def export_file(obj, filename, extension, flag=None):
    """Export a valid object to file"""
    if extension == ".gri" and isinstance(obj, xtgeo.RegularSurface):
        obj.to_file(filename, fformat="irap_binary")
    elif extension == ".csv" and isinstance(obj,
                                            (xtgeo.Polygons, xtgeo.Points)):
        out = obj.copy()  # to not modify incoming instance!
        if "xtgeo" not in flag:
            out.xname = "X"
            out.yname = "Y"
            out.zname = "Z"
            if isinstance(out, xtgeo.Polygons):
                # out.pname = "ID"  not working
                out.dataframe.rename(columns={out.pname: "ID"}, inplace=True)
        out.dataframe.to_csv(filename, index=False)
    elif extension == ".pol" and isinstance(obj,
                                            (xtgeo.Polygons, xtgeo.Points)):
        obj.to_file(filename)
    elif extension == ".segy" and isinstance(obj, xtgeo.Cube):
        obj.to_file(filename, fformat="segy")
    elif extension == ".roff" and isinstance(obj,
                                             (xtgeo.Grid, xtgeo.GridProperty)):
        obj.to_file(filename, fformat="roff")
    elif extension == ".csv" and isinstance(obj, pd.DataFrame):
        includeindex = True if flag == "include_index" else False
        obj.to_csv(filename, index=includeindex)
    elif extension == ".arrow" and HAS_PYARROW and isinstance(obj, pa.Table):
        # comment taken from equinor/webviz_subsurface/smry2arrow.py

        # Writing here is done through the feather import, but could also be done using
        # pa.RecordBatchFileWriter.write_table() with a few pa.ipc.IpcWriteOptions(). It
        # is convenient to use feather since it has ready configured defaults and the
        # actual file format is the same
        # (https://arrow.apache.org/docs/python/feather.html)
        feather.write_feather(obj, dest=filename)
    else:
        raise TypeError(
            f"Exporting {extension} for {type(obj)} is not supported")

    return str(filename)
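As the comment inside the example notes, the same Arrow IPC file could also be produced without the feather helper, via pa.ipc.new_file (i.e. pa.RecordBatchFileWriter) together with pa.ipc.IpcWriteOptions. A rough, hedged sketch of that alternative; the table contents, output path, and compression choice are placeholders, not taken from the project:

import pyarrow as pa

table = pa.table({'a': [1, 2, 3]})
options = pa.ipc.IpcWriteOptions(compression='zstd')

# Write the table as an Arrow IPC file; feather.write_feather(obj, dest) with
# default settings produces the same on-disk format.
with pa.OSFile('data.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, table.schema, options=options) as writer:
        writer.write_table(table)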
Code Example #16
def test_v2_compression_options():
    df = pd.DataFrame({'A': np.arange(1000)})

    cases = [
        # compression, compression_level
        ('uncompressed', None),
        ('lz4', None),
        ('lz4', 1),
        ('lz4', 12),
        ('zstd', 1),
        ('zstd', 10)
    ]

    for compression, compression_level in cases:
        _check_pandas_roundtrip(df, compression=compression,
                                compression_level=compression_level)

    buf = io.BytesIO()

    # Trying to compress with V1
    with pytest.raises(
            ValueError,
            match="Feather V1 files do not support compression option"):
        write_feather(df, buf, compression='lz4', version=1)

    # Trying to set chunksize with V1
    with pytest.raises(
            ValueError,
            match="Feather V1 files do not support chunksize option"):
        write_feather(df, buf, chunksize=4096, version=1)

    # Unsupported compressor
    with pytest.raises(ValueError,
                       match='compression="snappy" not supported'):
        write_feather(df, buf, compression='snappy')
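A small companion sketch to the test above: the codec is recorded in the file itself, so reading a compressed V2 file back needs no compression argument (the buffer and values here are illustrative):

import io

import numpy as np
import pandas as pd
from pyarrow.feather import read_feather, write_feather

df = pd.DataFrame({'A': np.arange(1000)})
buf = io.BytesIO()
write_feather(df, buf, compression='zstd', compression_level=5)  # V2 is the default
buf.seek(0)
assert read_feather(buf).equals(df)  # codec is detected from the file metadata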
Code Example #17
File: test_feather.py Project: zzzeddy/arrow
def test_read_table(version):
    num_values = (100, 100)
    path = random_path()

    TEST_FILES.append(path)

    values = np.random.randint(0, 100, size=num_values)

    df = pd.DataFrame(values, columns=['col_' + str(i)
                                       for i in range(100)])
    write_feather(df, path, version=version)

    data = pd.DataFrame(values,
                        columns=['col_' + str(i) for i in range(100)])
    table = pa.Table.from_pandas(data)

    result = read_table(path)
    assert_frame_equal(table.to_pandas(), result.to_pandas())

    # Test without memory mapping
    result = read_table(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result.to_pandas())
Code Example #18
File: test_arrow.py Project: deeprtc/io
  def test_arrow_feather_dataset(self):
    """test_arrow_feather_dataset"""
    # Feather files currently do not support columns of list types
    truth_data = TruthData(self.scalar_data, self.scalar_dtypes,
                           self.scalar_shapes)

    batch = self.make_record_batch(truth_data)
    df = batch.to_pandas()

    # Create a tempfile that is deleted after tests run
    with tempfile.NamedTemporaryFile(delete=False) as f:
      write_feather(df, f)

    # test single file
    dataset = arrow_io.ArrowFeatherDataset(
        f.name,
        list(range(len(truth_data.output_types))),
        truth_data.output_types,
        truth_data.output_shapes)
    self.run_test_case(dataset, truth_data)

    # test multiple files
    dataset = arrow_io.ArrowFeatherDataset(
        [f.name, f.name],
        list(range(len(truth_data.output_types))),
        truth_data.output_types,
        truth_data.output_shapes)
    truth_data_doubled = TruthData(
        [d * 2 for d in truth_data.data],
        truth_data.output_types,
        truth_data.output_shapes)
    self.run_test_case(dataset, truth_data_doubled)

    # test construction from schema
    dataset = arrow_io.ArrowFeatherDataset.from_schema(
        f.name, batch.schema)
    self.run_test_case(dataset, truth_data)

    os.unlink(f.name)
Code Example #19
    def test_feather_chunked(self):
        from pyarrow.feather import write_feather

        x = np.arange(10).reshape(5, 2)
        s = TensorArray(x)
        df1 = pd.DataFrame({"i": list(range(len(s))), "tensor": s})

        # Create a Table with 2 chunks
        table1 = pa.Table.from_pandas(df1)
        df2 = df1.copy()
        df2["tensor"] = df2["tensor"] * 10
        table2 = pa.Table.from_pandas(df2)
        table = pa.concat_tables([table1, table2])
        self.assertEqual(table.column("tensor").num_chunks, 2)

        # Write table to feather and read back as a DataFrame
        with tempfile.TemporaryDirectory() as dirpath:
            filename = os.path.join(dirpath, "tensor_array_chunked_test.feather")
            write_feather(table, filename)
            df_read = pd.read_feather(filename)
            df_expected = pd.concat([df1, df2]).reset_index(drop=True)
            pd.testing.assert_frame_equal(df_expected, df_read)
Code Example #20
File: s3painfree.py Project: kunal-kotian/access3
def save_df_s3(df, bucket_name, filepath, filetype='feather'):
    """ Save df to filepath on the S3 bucket in the specified format.
    Supported formats: feather, pickle.

    Parameters:
        df: pandas dataframe to be saved
        bucket_name: name of the S3 bucket
        filepath: path of the saved location on S3, including the filename
                  (relative to the S3 bucket's home dir)
        filetype: output format; 'feather' (default) or 'pickle'
    Example: Save the df 'allmysecrets' as a feather file 'topsecret.feather'
    save_df_s3(df=allmysecrets, bucket_name='bucketymcbucket',
               filepath='data/topsecret.feather')
    """
    s3_resource = boto3.resource('s3')
    with BytesIO() as f:
        if filetype == 'feather':
            write_feather(df, f)
            s3_resource.Object(bucket_name, filepath).put(Body=f.getvalue())

        elif filetype == 'pickle':
            pickle.dump(df, f)
            s3_resource.Object(bucket_name, filepath).put(Body=f.getvalue())
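A hedged counterpart for loading such a feather file back from S3 might look like the sketch below; load_df_s3 and its arguments are hypothetical, not part of the project above:

import boto3
import pandas as pd
from io import BytesIO

def load_df_s3(bucket_name, filepath):
    """Hypothetical inverse of save_df_s3 for the 'feather' case."""
    body = boto3.resource('s3').Object(bucket_name, filepath).get()['Body']
    return pd.read_feather(BytesIO(body.read()))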
Code Example #21
File: test_feather.py Project: zhijunfu/arrow
    def _check_pandas_roundtrip(self, df, expected=None, path=None,
                                columns=None, null_counts=None,
                                nthreads=1):
        if path is None:
            path = random_path()

        self.test_files.append(path)
        write_feather(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')

        result = read_feather(path, columns, nthreads=nthreads)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)

        if null_counts is None:
            null_counts = np.zeros(len(expected.columns))

        np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                      null_counts)
Code Example #22
File: test_feather.py Project: zeroshade/arrow
def _check_pandas_roundtrip(df, expected=None, path=None,
                            columns=None, use_threads=False,
                            version=None, compression=None,
                            compression_level=None):
    if path is None:
        path = random_path()

    if version is None:
        version = 2

    TEST_FILES.append(path)
    write_feather(df, path, compression=compression,
                  compression_level=compression_level, version=version)

    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_feather(path, columns, use_threads=use_threads)

    if expected is None:
        expected = df

    assert_frame_equal(result, expected)
Code Example #23
def test_use_threads(version):
    # ARROW-14470
    num_values = (10, 10)
    path = random_path()

    TEST_FILES.append(path)

    values = np.random.randint(0, 10, size=num_values)
    columns = ['col_' + str(i) for i in range(10)]
    table = pa.Table.from_arrays(values, columns)

    write_feather(table, path, version=version)

    result = read_feather(path)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_feather with use_threads=False
    result = read_feather(path, use_threads=False)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_table with use_threads=False
    result = read_table(path, use_threads=False)
    assert result.equals(table)
Code Example #24
File: test_arrow.py Project: kyamagu/io
    def test_arrow_list_feather_columns(self):
        """test_arrow_list_feather_columns"""
        import tensorflow_io.arrow as arrow_io

        from pyarrow.feather import write_feather

        # Feather files currently do not support columns of list types
        truth_data = TruthData(self.scalar_data, self.scalar_dtypes,
                               self.scalar_shapes)

        batch = self.make_record_batch(truth_data)
        df = batch.to_pandas()

        # Create a tempfile that is deleted after tests run
        with tempfile.NamedTemporaryFile(delete=False) as f:
            write_feather(df, f, version=1)

        # test single file
        # prefix "file://" to test scheme file system (e.g., s3, gcs, azfs, ignite)
        columns = arrow_io.list_feather_columns("file://" + f.name)
        for name, dtype in list(zip(batch.schema.names, batch.schema.types)):
            assert columns[name].name == name
            assert columns[name].dtype == dtype
            assert columns[name].shape == [4]

        # test memory
        with open(f.name, "rb") as ff:
            memory = ff.read()
        # when memory is provided filename doesn't matter:
        columns = arrow_io.list_feather_columns("file:///non_exist",
                                                memory=memory)
        for name, dtype in list(zip(batch.schema.names, batch.schema.types)):
            assert columns[name].name == name
            assert columns[name].dtype == dtype
            assert columns[name].shape == [4]

        os.unlink(f.name)
Code Example #25
def convert2feather(fname: str,
                    out_folder: str,
                    name: str,
                    extension: str = "feather") -> str:
    """
    Convert a whole genome rankings database to a feather format based database.

    More information on this format can be found here:
    .. feather-format: https://blog.rstudio.com/2016/03/29/feather/

    :param fname: The filename of the legacy database.
    :param out_folder: The name of the folder to write the new database to.
    :param name: The name of the rankings database.
    :param extension: The extension of the new database file.
    :return: The filename of the new database.
    """
    assert os.path.isfile(fname), "{} does not exist.".format(fname)
    assert os.path.isdir(out_folder), "{} is not a directory.".format(
        out_folder)

    feather_fname = os.path.join(
        out_folder, "{}.{}".format(
            os.path.splitext(os.path.basename(fname))[0], extension))
    assert not os.path.exists(feather_fname), "{} already exists.".format(
        feather_fname)

    # Load original database into memory.
    # Caveat: the original storage format of whole genome rankings does not store the metadata, i.e. name.
    db = SQLiteRankingDatabase(fname=fname, name=name)
    df = db.load_full()
    df.index.name = INDEX_NAME
    df.reset_index(
        inplace=True
    )  # Index is not stored in feather format. https://github.com/wesm/feather/issues/200
    write_feather(df, feather_fname)
    return feather_fname
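Because the index is reset before writing (feather does not store the index), a reader of the converted database has to restore it. A minimal sketch, assuming the same INDEX_NAME constant and a feather_fname returned by convert2feather:

from pyarrow import feather

df = feather.read_feather(feather_fname)
df = df.set_index(INDEX_NAME)  # re-apply the index column dropped before writing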
Code Example #26
def test_feather_format(tempdir):
    from pyarrow.feather import write_feather

    table = pa.table({
        'a': pa.array([1, 2, 3], type="int8"),
        'b': pa.array([.1, .2, .3], type="float64")
    })

    basedir = tempdir / "feather_dataset"
    basedir.mkdir()
    write_feather(table, str(basedir / "data.feather"))

    dataset = ds.dataset(basedir, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    dataset = ds.dataset(basedir, format="feather")
    result = dataset.to_table()
    assert result.equals(table)

    # error with Feather v1 files
    write_feather(table, str(basedir / "data1.feather"), version=1)
    with pytest.raises(ValueError):
        ds.dataset(basedir, format="feather").to_table()
Code Example #27
def count_ligs(chunk):
    print(type(chunk))
    print(chunk.schema)
    table_batch = chunk
    smiles = list(table_batch.column('smiles'))
    names = list(table_batch.column('zinc_id'))
    fingerprint_function = rdMolDescriptors.GetMorganFingerprintAsBitVect
    pars = {
        "radius": 2,
        "nBits": 8192,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": False,
        "useBondTypes": True,
        "useFeatures": True,
    }

    fingerprint_function = rdMolDescriptors.GetMorganFingerprintAsBitVect
    print(f'the number of smiles in the record batch is {len(smiles)}')
    count_ligs = len(smiles)
    smiles = [x for x in smiles]
    named_mols = []
    for count, m in enumerate(smiles):
        mol = Chem.MolFromSmiles(str(m))
        molid = names[count]
        mol.SetProp('_Name', str(molid))
        named_mols.append(mol)
    record_batches = gen_fp(named_mols, pars, fingerprint_function)
    tab = gen_fp(named_mols, pars, fingerprint_function)
    target_directory = '/data/dockop_data/feathers_zinc_15'
    name = os.path.join(target_directory, 'zinc_15_subset%s' + '.feather')
    unique_feather = next_path(name)
    with open(unique_feather, 'wb') as f:
        feather.write_feather(tab, f)
    print(f'The feather was written to {unique_feather}')
    return unique_feather
Code Example #28
    def storeModel(self):
        if os.path.isdir(self.brain_path):
            shutil.rmtree(self.brain_path)
        
        os.makedirs(self.brain_path)

        file_path_nodes = os.path.join(self.brain_path, 'hidden.feather')

        for i in range(len(self.weights)):
            file_path_weights = os.path.join(self.brain_path, f'weights{i}.feather')
            file_path_biases = os.path.join(self.brain_path, f'biases{i}.feather')
            feather.write_feather(pd.DataFrame(self.weights[i]), file_path_weights)
            feather.write_feather(pd.DataFrame(self.biases[i]), file_path_biases)
        
        feather.write_feather(pd.DataFrame(self.hidden_nodes), file_path_nodes)
Code Example #29
File: benchmark.py Project: ursa-labs/notebooks
    def bench_write(self, niter=2):
        print("Reading text file: {}".format(self.csv_path))
        df = pd.read_csv(self.csv_path,
                         sep=self.sep,
                         header=self.header,
                         low_memory=False)
        if self.header is None:
            df.columns = ['f{}'.format(i) for i in range(len(df.columns))]

        def _get_table(df):
            return (pa.Table.from_pandas(
                df, preserve_index=False).replace_schema_metadata(None))

        t = _get_table(df)

        cases = [
            ('parquet (UNC)', 'arrow Table', lambda: pq.write_table(
                t, self.parquet_unc_path, compression='NONE')),
            ('parquet (UNC)', 'pandas', lambda: pq.write_table(
                _get_table(df), self.parquet_unc_path, compression='NONE')),
            ('parquet (SNAPPY)', 'arrow Table',
             lambda: pq.write_table(t, self.parquet_snappy_path)),
            ('parquet (SNAPPY)', 'pandas',
             lambda: pq.write_table(_get_table(df), self.parquet_snappy_path)),
            ('feather V2 (UNC)', 'pandas', lambda: feather.write_feather(
                df, self.feather_unc_path, compression='uncompressed')),
            ('feather V2 (UNC)', 'arrow Table', lambda: feather.write_feather(
                t, self.feather_unc_path, compression='uncompressed')),
            ('feather V2 (LZ4)', 'pandas', lambda: feather.write_feather(
                df, self.feather_lz4_path, compression='lz4')),
            ('feather V2 (LZ4)', 'arrow Table', lambda: feather.write_feather(
                t, self.feather_lz4_path, compression='lz4')),
            ('feather V2 (ZSTD)', 'pandas', lambda: feather.write_feather(
                df, self.feather_zstd_path, compression='zstd')),
            ('feather V2 (ZSTD)', 'arrow Table', lambda: feather.write_feather(
                t, self.feather_zstd_path, compression='zstd'))
        ]

        return self._bench_cases(cases, niter)
Code Example #30
 def to_feather(self,expandCategory,expandTime,preprocessType,seperateLabels,chunksize):
     """
     to_feather transform Time_Series_Data or Time_Series_Data_Collection
     to feather file
     
     Parameters
     ----------
     expandCategory : bool
         whether to expand category
     expandTime : bool
         whether to expand time
     preprocessType : ['ignore','pad','remove']
         preprocess data time across categories
     seperateLabels : bool
         whether to seperate labels and data
     chunksize : int
         size of feather file
     """
     if seperateLabels ==False:
         table = to_arrow_table(
             time_series = self.time_series,
             expandCategory = expandCategory,
             expandTime= expandTime,
             preprocessType = preprocessType,
             seperateLabels = seperateLabels
             )
         pf.write_feather(table,self.dirPaths,version = self.version,chunksize=chunksize)
         return
     table, label_table = to_arrow_table(
             time_series = self.time_series,
             expandCategory = expandCategory,
             expandTime= expandTime,
             preprocessType = preprocessType,
             seperateLabels = seperateLabels
             )
     pf.write_feather(table,self.dirPaths[0],version = self.version,chunksize=chunksize)
     pf.write_feather(label_table,self.dirPaths[1],version = self.version,chunksize=chunksize)
Code Example #31
 def f():
     write_feather(df, path)
Code Example #32
        indices = duplicateRowsDF.index
        names = duplicateRowsDF['names']

        dupenum = 2
        for count,indexid in enumerate(indices):
            print(df.loc[indexid,'names'])
            new_value = f'{names[indexid]}_{str(dupenum)}'
            print(new_value)
            df.at[indexid,'names'] = new_value
    return df


molchunk_path = '/data/dopamine_3_results/mol_chunk_docking/mol_chunks_test_firstreal_first400.molchunk'

autodock_gpu = '/home/schrogpu/ADFRsuite-1.0/AutoDock-GPU/bin/autodock_gpu_128wi'
receptor_path = '/home/schrogpu/ADFRsuite-1.0/pocket2_fixer_moreatoms/rigidReceptor.maps.fld'
lsmet = 'sw'
num_runs = 50
col_to_dock = 'pdbqt_block_am1bcc'
# col_to_dock = 'pdbqt_gast_list'
working_dir = '/data/dopamine_3_results/mol_chunk_docking/first400d3'
df = df_from_molchunk(molchunk_path)
df = remove_duplicates(df)
mols_to_pymol(df)
outdf = run_autodock_gpu(df, col_to_dock, autodock_gpu, lsmet, num_runs, working_dir, receptor_path)
indexed_df = index_docking_output(outdf)
final_df = extract_specific_pdbqt(indexed_df, num_runs, working_dir)
show_docked(final_df)
del final_df['ROMol']
feather.write_feather(final_df, '/data/dopamine_3_results/mol_chunk_docking/mol_chunks_test_firstreal_first400_out.molchunk')
Code Example #33
File: feather_format.py Project: zbrookle/pandas
def to_feather(
    df: DataFrame,
    path: FilePathOrBuffer[AnyStr],
    storage_options: StorageOptions = None,
    **kwargs,
):
    """
    Write a DataFrame to the binary Feather format.

    Parameters
    ----------
    df : DataFrame
    path : string file path, or file-like object
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values.

        .. versionadded:: 1.2.0

    **kwargs :
        Additional keywords passed to `pyarrow.feather.write_feather`.

        .. versionadded:: 1.1.0
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    ioargs = get_filepath_or_buffer(path, mode="wb", storage_options=storage_options)

    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    valid_types = {"string", "unicode"}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        typ = type(df.index)
        raise ValueError(
            f"feather does not support serializing {typ} "
            "for the index; you can .reset_index() to make the index into column(s)"
        )

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError(
            "feather does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )

    if df.index.name is not None:
        raise ValueError(
            "feather does not serialize index meta-data on a default index"
        )

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs)

    if ioargs.should_close:
        assert not isinstance(ioargs.filepath_or_buffer, str)
        ioargs.filepath_or_buffer.close()
Code Example #34
File: test_feather.py Project: sunchao/arrow
 def f():
     write_feather(df, path)
Code Example #35
                # call sentiment function
                sentiment_over_time(0,lang,date_range_input,file,subfolder)     
            
            
            
# load the created csv with the missing data    
df_final = pd.read_csv(r"C:\Users\simon\Desktop\WS_20_21\DS_12\test.csv")

# reset index
df_final = df_final.reset_index()

# rename columns
df_final = df_final.rename(columns={"level_0":"language","level_1":"retweet_count",
                         "level_2":"date","level_3":"company","level_4":"sentiment_mean",
                         "level_5":"sentiment_weight_retweet","level_6":"sentiment_weight_length",
                         "Unnamed: 0":"sentiment_weight_likes"})

# append dataframe to the original dataframe
df_full = df_full.append(df_final)            

# sort values
df_full = df_full.sort_values(by=['company','language','retweet_count'])

# write a feather file with the complete dataframe
feather.write_feather(df_full, r"C:\Users\simon\Desktop\WS_20_21\DS_12\test_feather")   




Code Example #36
 def f():
     write_feather(df, path, version=version)
Code Example #37
def convert_to_cache(my_cccc, input_file_list, out_dir, checksum_arrow_file, out_list_file, tmp_grib_file, conf_list, debug):
    warno = 189
    checksum_df = feather.read_feather(checksum_arrow_file)
    checksum_list = []
    now = datetime.utcnow()
    for in_file in input_file_list:
        if debug:
            print('Debug', ':', 'in_file =', in_file, file=sys.stderr)
        with open(in_file, 'rb') as in_file_stream:
            batch_type = 0
            message_length = 0
            start_byte4 = None
            start_char4 = None
            try:
                start_byte4 = in_file_stream.read(4)
                if len(start_byte4) < 4:
                    break
                start_char4 = start_byte4.decode()
            except:
                print('Warning', warno, ':', 'The start 4 bytes of', in_file, 'are not strings.', file=sys.stderr)
            while start_char4:
                if debug:
                    print('Debug', ':', 'start_char4 =', start_char4, file=sys.stderr)
                message = bytearray()
                if re.match(r'\d\d\d\d', start_char4):
                    batch_type = 1
                    message_length = int(start_char4 + in_file_stream.read(4).decode())
                    try:
                        if message_length == 0:
                            break
                        format_identifier = int(in_file_stream.read(2).decode())
                        if format_identifier == 0:
                            in_file_stream.read(10) # skip
                            message_length -= 10
                        elif format_identifier == 1:
                            in_file_stream.read(3) # skip
                            message_length -= 3
                        else:
                            print('Warning', warno, ':', 'The format identifier of', in_file, 'is not 00 or 01.', file=sys.stderr)
                            break
                    except:
                        print('Warning', warno, ':', 'The bytes of message length on', in_file, 'are not strings.', file=sys.stderr)
                        break
                elif start_char4 == '####':
                    try:
                        batch_type = 2
                        in_file_stream.read(3) # skip '018'
                        message_length = int(in_file_stream.read(6).decode())
                        in_file_stream.read(5) # skip ####\n
                    except:
                        print('Warning', warno, ':', 'The bytes of message length on', in_file, 'are not strings.', file=sys.stderr)
                        break
                elif start_char4 == '****':
                    try:
                        batch_type = 3
                        message_length = int(in_file_stream.read(10).decode())
                        in_file_stream.read(5) # skip ****\n
                    except:
                        print('Warning', warno, ':', 'The bytes of message length on', in_file, 'are not strings.', file=sys.stderr)
                        break
                else:
                    try:
                        message.extend(start_char4.encode())
                        message.extend(in_file_stream.read())
                    except:
                        print('Warning', warno, ':', 'can not encode or read', in_file, file=sys.stderr)
                        break
                    out_file = create_file(in_file, my_cccc, message, start_char4, out_dir, tmp_grib_file, conf_list, debug)
                    if out_file:
                        out_file_checksum = getHash(out_file)
                        if len(checksum_df[checksum_df['checksum'] == out_file_checksum].index) == 0 and not out_file_checksum in checksum_list:
                            checksum_list.append(out_file_checksum)
                            print(out_file, file=out_list_file)
                        else:
                            os.remove(out_file)
                    break
                if message_length <= 0:
                    if debug:
                        print('Debug', ':', 'The message length of', in_file, 'is invalid. (<=0)', file=sys.stderr)
                    break
                if debug:
                    print('Debug', ':', 'batch_type =', batch_type, ', message_length =', message_length, file=sys.stderr)
                if batch_type == 1:
                    message = bytearray(in_file_stream.read(message_length))
                elif batch_type == 2 or batch_type == 3:
                    message = bytearray(in_file_stream.read(message_length))
                message_counter = len(message) - 1
                while message_counter > -1:
                    if message[message_counter] == 3 or message[message_counter] == 10 or message[message_counter] == 13 or message[message_counter] == 32:
                        message.pop(message_counter)
                    else:
                        break
                    message_counter -= 1
                message_counter = 0
                while message_counter < len(message):
                    if message[0] == 10 or message[0] == 13 or message[0] == 32:
                        message.pop(0)
                    else:
                        break
                    message_counter += 1
                out_file = create_file_from_batch(in_file, my_cccc, message, out_dir, tmp_grib_file, conf_list, debug)
                if out_file:
                    out_file_checksum = getHash(out_file)
                    if len(checksum_df[checksum_df['checksum'] == out_file_checksum].index) == 0 and not out_file_checksum in checksum_list:
                        checksum_list.append(out_file_checksum)
                        print(out_file, file=out_list_file)
                    else:
                        os.remove(out_file)
                try:
                    byte4 = in_file_stream.read(4)
                    if len(byte4) < 4:
                        break
                    start_char4 = byte4.decode()
                except:
                    start_char4 = None
                    print('Warning', warno, ':', 'The start 4 bytes of the message on', in_file, 'are not strings.', file=sys.stderr)
    if len(checksum_list) > 0:
        td = timedelta(days=1)
        new_checksum_df = pd.concat([checksum_df[(checksum_df['mtime'] >= now - td) & (checksum_df['mtime'] <= now + td)], pd.DataFrame({"mtime": [now] * len(checksum_list), "checksum": checksum_list})])
        with open(checksum_arrow_file, 'bw') as checksum_arrow_f:
            feather.write_feather(new_checksum_df, checksum_arrow_f, compression='zstd')
Code Example #38
File: train.py Project: Repsay/BinanceAPI
    data = base_data.copy()

    for interval in intervals:
        print(interval.name)
        if interval == CandlestickInterval.minutes1:
            del dfs[interval]
            continue
        else:
            temp_df = dfs[interval][["CloseTime", "OpenPrice", "HighPrice", "LowPrice", "ClosePrice","Volume","NumberTrades"]]
            data = pd.merge_asof(data.sort_values("CloseTime"), temp_df.sort_values("CloseTime"), on="CloseTime", suffixes=(None, f'_{interval.name}'))
            del dfs[interval]
            del temp_df
    
    data = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(data.values))

    feather.write_feather(data, data_path)
idx = 0
first = True
data: pd.DataFrame = feather.read_feather(data_path).iloc[:-45000]
highestFit = float('-inf') #[float('-inf') for i in range(len(HIDDEN_LAYERS))]
generationCount = 0 #[0 for i in range(len(HIDDEN_LAYERS))]
print("Generating traders")
bestTrader = None #[None for i in range(len(HIDDEN_LAYERS))]
previousTraders: list[Trader] = []

while generationCount <= AMOUNTOFGENS:

    totalFitness = 0
    new = False
    allTraders: list[Trader] = []
    if first:
Code Example #39
    pool = mp.Pool()
    p_dict = {}
    for i in range(int((end - start) / step)):
        p_dict['p' + str(i)] = pool.apply_async(get_char_daily, (
            df['crsp%s' % i],
            df['firm%s' % i],
        ))
    pool.close()
    pool.join()
    result = pd.DataFrame()
    print('processing pd.concat')
    for h in range(int((end - start) / step)):
        result = pd.concat([result, p_dict['p%s' % h].get()])
    return result


# calculate variance of residual through rolling window
# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub
# dataframes here, so the function will use 20 cores to calculate variance of residual.
if __name__ == '__main__':
    crsp_out = main(0, 1, 0.05)

# process dataframe
crsp_out = crsp_out.dropna(subset=['rvar'])  # drop NA due to rolling
crsp_out = crsp_out.rename(columns={'rvar': 'rvar_capm'})
crsp_out = crsp_out.reset_index(drop=True)
crsp_out = crsp_out[['permno', 'date', 'rvar_capm']]

with open('rvar_capm_daily.feather', 'wb') as f:
    feather.write_feather(crsp_out, f)
Code Example #40
    reset_index()

pmnts.columns = [
    i[0] if i[1] == '' else '_'.join(i) for i in pmnts.columns.values
]

#join all tables
df = pd.merge(pmnts, num_inst_len, on=["SK_ID_CURR", "SK_ID_PREV"])
df = pd.merge(df, num_inst_ver, on=["SK_ID_CURR", "SK_ID_PREV"])

#aggregate by SK_ID_CURR
df = df.drop(['SK_ID_PREV'], axis=1)

df = df.groupby('SK_ID_CURR').\
    agg([np.mean, np.sum, np.std]).\
    reset_index()

df.columns = [
    i[0] if i[1] == '' else '_'.join(reversed(i)) for i in df.columns.values
]

#save as feather file
#feather.write_feather(df, 'instalment_payments')

suffix = '.feather'

filePath = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                        table + '_features' + suffix)

feather.write_feather(df, filePath)