def test_overwritten_file(self):
    path = random_path()
    num_values = 100
    np.random.seed(0)

    values = np.random.randint(0, 10, size=num_values)
    write_feather(pd.DataFrame({'ints': values}), path)

    df = pd.DataFrame({'ints': values[0: num_values // 2]})
    self._check_pandas_roundtrip(df, path=path)
def test_filelike_objects(self):
    from io import BytesIO

    buf = BytesIO()

    # the copy makes it non-strided
    df = pd.DataFrame(np.arange(12).reshape(4, 3),
                      columns=['a', 'b', 'c']).copy()
    write_feather(df, buf)

    buf.seek(0)

    result = read_feather(buf)
    assert_frame_equal(result, df)
def test_chunked_binary_error_message():
    # ARROW-3058: As Feather does not yet support chunked columns, we at least
    # make sure it's clear to the user what is going on

    # 2^31 + 1 bytes
    values = [b'x'] + [
        b'x' * (1 << 20)
    ] * 2 * (1 << 10)
    df = pd.DataFrame({'byte_col': values})

    with pytest.raises(ValueError,
                       match="'byte_col' exceeds 2GB maximum "
                             "capacity of a Feather binary column. This restriction "
                             "may be lifted in the future"):
        write_feather(df, io.BytesIO())
def test_delete_partial_file_on_error(self):
    # strings will fail
    df = pd.DataFrame(
        {
            'numbers': range(5),
            'strings': [b'foo', None, u'bar', 'qux', np.nan]},
        columns=['numbers', 'strings'])

    path = random_path()
    try:
        write_feather(df, path)
    except Exception:
        pass

    assert not os.path.exists(path)
def test_num_columns_attr(self):
    df0 = pd.DataFrame({})
    df1 = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
    df2 = pd.DataFrame({'foo': [1, 2, 3, 4, 5],
                        'bar': [1, 2, 3, 4, 5]})

    for df, ncols in zip([df0, df1, df2], [0, 1, 2]):
        path = random_path()
        self.test_files.append(path)
        write_feather(df, path)

        reader = FeatherReader(path)
        assert reader.num_columns == ncols
def test_num_rows_attr(self):
    df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
    path = random_path()
    self.test_files.append(path)
    write_feather(df, path)

    reader = FeatherReader(path)
    assert reader.num_rows == len(df)

    df = pd.DataFrame({})
    path = random_path()
    self.test_files.append(path)
    write_feather(df, path)

    reader = FeatherReader(path)
    assert reader.num_rows == 0
def to_feather(df, path):
    """
    Write a DataFrame to the feather-format

    Parameters
    ----------
    df : DataFrame
    path : string file path, or file-like object
    """
    path = _stringify_path(path)

    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    feather = _try_import()[0]
    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("feather does not support serializing {} "
                         "for the index; you can .reset_index() "
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("feather does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("feather does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_feather(df, path)
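# Minimal round-trip sketch for the validation rules above, via the public
# DataFrame.to_feather wrapper. The file name is a placeholder; assumes pandas
# with a pyarrow/feather backend installed. Illustration only, not the library's
# own test.
import pandas as pd

example = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})  # default RangeIndex, str columns
example.to_feather("example.feather")                 # accepted by the checks above
assert pd.read_feather("example.feather").equals(example)

# A non-default index would be rejected by the checks above, e.g.:
# example.set_index("b").to_feather("bad.feather")    # -> ValueError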
def test_delete_partial_file_on_error(self):
    if sys.platform == 'win32':
        pytest.skip('Windows hangs on to file handle for some reason')

    # strings will fail
    df = pd.DataFrame(
        {
            'numbers': range(5),
            'strings': [b'foo', None, u'bar', 'qux', np.nan]},
        columns=['numbers', 'strings'])

    path = random_path()
    try:
        write_feather(df, path)
    except Exception:
        pass

    assert not os.path.exists(path)
def test_boolean_nulls(version):
    # pandas requires upcast to object dtype
    path = random_path()
    TEST_FILES.append(path)

    num_values = 100
    np.random.seed(0)

    mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 10, size=num_values) < 5

    table = pa.table([pa.array(values, mask=mask)], names=['bools'])
    write_feather(table, path, version=version)

    expected = values.astype(object)
    expected[mask] = None

    ex_frame = pd.DataFrame({'bools': expected})

    result = read_feather(path)
    assert_frame_equal(result, ex_frame)
def test_delete_partial_file_on_error(self):
    if sys.platform == 'win32':
        pytest.skip('Windows hangs on to file handle for some reason')

    class CustomClass(object):
        pass

    # strings will fail
    df = pd.DataFrame(
        {
            'numbers': range(5),
            'strings': [b'foo', None, u'bar', CustomClass(), np.nan]},
        columns=['numbers', 'strings'])

    path = random_path()
    try:
        write_feather(df, path)
    except Exception:
        pass

    assert not os.path.exists(path)
def test_read_table(version):
    num_values = (100, 100)
    path = random_path()
    TEST_FILES.append(path)

    values = np.random.randint(0, 100, size=num_values)
    columns = ['col_' + str(i) for i in range(100)]
    table = pa.Table.from_arrays(values, columns)

    write_feather(table, path, version=version)

    result = read_table(path)
    assert result.equals(table)

    # Test without memory mapping
    result = read_table(path, memory_map=False)
    assert result.equals(table)

    result = read_feather(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result)
def test_dataset(version):
    num_values = (100, 100)
    num_files = 5
    paths = [random_path() for i in range(num_files)]
    data = {
        "col_" + str(i): np.random.randn(num_values[0])
        for i in range(num_values[1])
    }
    table = pa.table(data)

    TEST_FILES.extend(paths)
    for index, path in enumerate(paths):
        rows = (
            index * (num_values[0] // num_files),
            (index + 1) * (num_values[0] // num_files),
        )
        write_feather(table[rows[0]: rows[1]], path, version=version)

    data = FeatherDataset(paths).read_table()
    assert data.equals(table)
def _check_pandas_roundtrip(self, df, expected=None, path=None,
                            columns=None, null_counts=None):
    if path is None:
        path = random_path()

    self.test_files.append(path)
    write_feather(df, path)
    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_feather(path, columns)
    if expected is None:
        expected = df

    assert_frame_equal(result, expected)

    if null_counts is None:
        null_counts = np.zeros(len(expected.columns))

    np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                  null_counts)
def root2disk(fileName, chunkSize=1000000):
    """
    A function to convert an input ROOT file into parquet and feather formats
    for later, faster input/output from disk.

    Parameters
    ----------
    - fileName : string
        Specifies location and name of the file
    - chunkSize : int
        A part of the whole sample that is read and processed at a time.

    Raises
    ------
    - TypeError if fileName and chunkSize are not of the expected types.

    Returns
    -------
    - Void
    """
    if not isinstance(fileName, str) and not isinstance(chunkSize, int):
        raise TypeError(
            "Please specify both fileName and chunkSize parameters! Exiting.")

    # pbar = ProgressBar()
    count = 1
    # for df in pbar(read_root(filename, chunksize=oneMillion)):
    print('Processing >>')
    for df in read_root(paths=fileName, chunksize=chunkSize):
        print('>>' * count)
        feather.write_feather(df, 'ndf_{0}.feather'.format(count))
        df.to_parquet('ndf_{0}.parquet'.format(count),
                      engine='fastparquet', compression='gzip')
        count += 1
        if count > 100:
            break
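# Hedged companion sketch (not part of root2disk): reading the numbered feather
# chunks written above back into a single DataFrame. The glob pattern mirrors
# the 'ndf_{0}.feather' naming used in the loop; the helper name is an assumption.
import glob
import pandas as pd
from pyarrow import feather

def chunks_to_df(pattern='ndf_*.feather'):
    """Concatenate all feather chunks matching the pattern into one DataFrame."""
    return pd.concat((feather.read_feather(p) for p in sorted(glob.glob(pattern))),
                     ignore_index=True)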
def export_file(obj, filename, extension, flag=None):
    """Export a valid object to file"""

    if extension == ".gri" and isinstance(obj, xtgeo.RegularSurface):
        obj.to_file(filename, fformat="irap_binary")
    elif extension == ".csv" and isinstance(obj, (xtgeo.Polygons, xtgeo.Points)):
        out = obj.copy()  # to not modify incoming instance!
        if "xtgeo" not in flag:
            out.xname = "X"
            out.yname = "Y"
            out.zname = "Z"
            if isinstance(out, xtgeo.Polygons):
                # out.pname = "ID" not working
                out.dataframe.rename(columns={out.pname: "ID"}, inplace=True)
        out.dataframe.to_csv(filename, index=False)
    elif extension == ".pol" and isinstance(obj, (xtgeo.Polygons, xtgeo.Points)):
        obj.to_file(filename)
    elif extension == ".segy" and isinstance(obj, xtgeo.Cube):
        obj.to_file(filename, fformat="segy")
    elif extension == ".roff" and isinstance(obj, (xtgeo.Grid, xtgeo.GridProperty)):
        obj.to_file(filename, fformat="roff")
    elif extension == ".csv" and isinstance(obj, pd.DataFrame):
        includeindex = True if flag == "include_index" else False
        obj.to_csv(filename, index=includeindex)
    elif extension == ".arrow" and HAS_PYARROW and isinstance(obj, pa.Table):
        # comment taken from equinor/webviz_subsurface/smry2arrow.py
        # Writing here is done through the feather import, but could also be done
        # using pa.RecordBatchFileWriter.write_table() with a few
        # pa.ipc.IpcWriteOptions(). It is convenient to use feather since it has
        # ready configured defaults and the actual file format is the same
        # (https://arrow.apache.org/docs/python/feather.html)
        feather.write_feather(obj, dest=filename)
    else:
        raise TypeError(
            f"Exporting {extension} for {type(obj)} is not supported")

    return str(filename)
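# Sketch of the alternative mentioned in the comment above: writing the same Arrow
# IPC (Feather V2) file with pa.RecordBatchFileWriter instead of feather.write_feather.
# The compression choice and function name are assumptions, not taken from the source.
import pyarrow as pa

def write_ipc_table(table: pa.Table, filename: str) -> None:
    """Write a pyarrow Table as an Arrow IPC file (same on-disk format as Feather V2)."""
    options = pa.ipc.IpcWriteOptions(compression="lz4")
    with pa.OSFile(filename, "wb") as sink:
        with pa.RecordBatchFileWriter(sink, table.schema, options=options) as writer:
            writer.write_table(table)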
def test_v2_compression_options():
    df = pd.DataFrame({'A': np.arange(1000)})

    cases = [
        # compression, compression_level
        ('uncompressed', None),
        ('lz4', None),
        ('lz4', 1),
        ('lz4', 12),
        ('zstd', 1),
        ('zstd', 10)
    ]

    for compression, compression_level in cases:
        _check_pandas_roundtrip(df, compression=compression,
                                compression_level=compression_level)

    buf = io.BytesIO()

    # Trying to compress with V1
    with pytest.raises(
            ValueError,
            match="Feather V1 files do not support compression option"):
        write_feather(df, buf, compression='lz4', version=1)

    # Trying to set chunksize with V1
    with pytest.raises(
            ValueError,
            match="Feather V1 files do not support chunksize option"):
        write_feather(df, buf, chunksize=4096, version=1)

    # Unsupported compressor
    with pytest.raises(ValueError,
                       match='compression="snappy" not supported'):
        write_feather(df, buf, compression='snappy')
def test_read_table(version):
    num_values = (100, 100)
    path = random_path()
    TEST_FILES.append(path)

    values = np.random.randint(0, 100, size=num_values)
    df = pd.DataFrame(values, columns=['col_' + str(i) for i in range(100)])
    write_feather(df, path, version=version)

    data = pd.DataFrame(values, columns=['col_' + str(i) for i in range(100)])
    table = pa.Table.from_pandas(data)

    result = read_table(path)
    assert_frame_equal(table.to_pandas(), result.to_pandas())

    # Test without memory mapping
    result = read_table(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result.to_pandas())
def test_arrow_feather_dataset(self):
    """test_arrow_feather_dataset"""
    # Feather files currently do not support columns of list types
    truth_data = TruthData(self.scalar_data, self.scalar_dtypes,
                           self.scalar_shapes)

    batch = self.make_record_batch(truth_data)
    df = batch.to_pandas()

    # Create a tempfile that is deleted after tests run
    with tempfile.NamedTemporaryFile(delete=False) as f:
        write_feather(df, f)

    # test single file
    dataset = arrow_io.ArrowFeatherDataset(
        f.name,
        list(range(len(truth_data.output_types))),
        truth_data.output_types,
        truth_data.output_shapes)
    self.run_test_case(dataset, truth_data)

    # test multiple files
    dataset = arrow_io.ArrowFeatherDataset(
        [f.name, f.name],
        list(range(len(truth_data.output_types))),
        truth_data.output_types,
        truth_data.output_shapes)
    truth_data_doubled = TruthData(
        [d * 2 for d in truth_data.data],
        truth_data.output_types,
        truth_data.output_shapes)
    self.run_test_case(dataset, truth_data_doubled)

    # test construction from schema
    dataset = arrow_io.ArrowFeatherDataset.from_schema(
        f.name, batch.schema)
    self.run_test_case(dataset, truth_data)

    os.unlink(f.name)
def test_feather_chunked(self):
    from pyarrow.feather import write_feather

    x = np.arange(10).reshape(5, 2)
    s = TensorArray(x)
    df1 = pd.DataFrame({"i": list(range(len(s))), "tensor": s})

    # Create a Table with 2 chunks
    table1 = pa.Table.from_pandas(df1)
    df2 = df1.copy()
    df2["tensor"] = df2["tensor"] * 10
    table2 = pa.Table.from_pandas(df2)
    table = pa.concat_tables([table1, table2])
    self.assertEqual(table.column("tensor").num_chunks, 2)

    # Write table to feather and read back as a DataFrame
    with tempfile.TemporaryDirectory() as dirpath:
        filename = os.path.join(dirpath, "tensor_array_chunked_test.feather")
        write_feather(table, filename)
        df_read = pd.read_feather(filename)
        df_expected = pd.concat([df1, df2]).reset_index(drop=True)
        pd.testing.assert_frame_equal(df_expected, df_read)
def save_df_s3(df, bucket_name, filepath, filetype='feather'):
    """
    Save df to filepath on the S3 bucket in the specified format.
    Supported formats: feather, pickle.

    Parameters:
        df: pandas dataframe to be saved
        bucket_name: name of the S3 bucket
        filepath: path of the saved location on S3, including the filename
            (relative to the S3 bucket's home dir)

    Example:
        Save the df 'allmysecrets' as a feather file 'topsecret.feather':
        save_df_s3(df=allmysecrets, bucket_name='bucketymcbucket',
                   filepath='data/topsecret.feather')
    """
    s3_resource = boto3.resource('s3')
    with BytesIO() as f:
        if filetype == 'feather':
            write_feather(df, f)
            s3_resource.Object(bucket_name, filepath).put(Body=f.getvalue())
        elif filetype == 'pickle':
            pickle.dump(df, f)
            s3_resource.Object(bucket_name, filepath).put(Body=f.getvalue())
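# Hedged companion sketch (not in the original): a read counterpart of save_df_s3
# for the feather case. Bucket and key below are placeholders; assumes boto3
# credentials are already configured.
import boto3
from io import BytesIO
from pyarrow.feather import read_feather

def load_df_s3(bucket_name, filepath):
    """Load a feather file stored on S3 into a pandas DataFrame."""
    body = boto3.resource('s3').Object(bucket_name, filepath).get()['Body'].read()
    return read_feather(BytesIO(body))

# df = load_df_s3('bucketymcbucket', 'data/topsecret.feather')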
def _check_pandas_roundtrip(self, df, expected=None, path=None,
                            columns=None, null_counts=None,
                            nthreads=1):
    if path is None:
        path = random_path()

    self.test_files.append(path)
    write_feather(df, path)
    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_feather(path, columns, nthreads=nthreads)
    if expected is None:
        expected = df

    assert_frame_equal(result, expected)

    if null_counts is None:
        null_counts = np.zeros(len(expected.columns))

    np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                  null_counts)
def _check_pandas_roundtrip(df, expected=None, path=None,
                            columns=None, use_threads=False,
                            version=None, compression=None,
                            compression_level=None):
    if path is None:
        path = random_path()

    if version is None:
        version = 2

    TEST_FILES.append(path)
    write_feather(df, path, compression=compression,
                  compression_level=compression_level, version=version)
    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_feather(path, columns, use_threads=use_threads)
    if expected is None:
        expected = df

    assert_frame_equal(result, expected)
def test_use_threads(version):
    # ARROW-14470
    num_values = (10, 10)
    path = random_path()
    TEST_FILES.append(path)

    values = np.random.randint(0, 10, size=num_values)
    columns = ['col_' + str(i) for i in range(10)]
    table = pa.Table.from_arrays(values, columns)

    write_feather(table, path, version=version)

    result = read_feather(path)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_feather with use_threads=False
    result = read_feather(path, use_threads=False)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_table with use_threads=False
    result = read_table(path, use_threads=False)
    assert result.equals(table)
def test_arrow_list_feather_columns(self):
    """test_arrow_list_feather_columns"""
    import tensorflow_io.arrow as arrow_io

    from pyarrow.feather import write_feather

    # Feather files currently do not support columns of list types
    truth_data = TruthData(self.scalar_data, self.scalar_dtypes,
                           self.scalar_shapes)

    batch = self.make_record_batch(truth_data)
    df = batch.to_pandas()

    # Create a tempfile that is deleted after tests run
    with tempfile.NamedTemporaryFile(delete=False) as f:
        write_feather(df, f, version=1)

    # test single file
    # prefix "file://" to test scheme file system (e.g., s3, gcs, azfs, ignite)
    columns = arrow_io.list_feather_columns("file://" + f.name)
    for name, dtype in list(zip(batch.schema.names, batch.schema.types)):
        assert columns[name].name == name
        assert columns[name].dtype == dtype
        assert columns[name].shape == [4]

    # test memory
    with open(f.name, "rb") as ff:
        memory = ff.read()
    # when memory is provided filename doesn't matter:
    columns = arrow_io.list_feather_columns("file:///non_exist", memory=memory)
    for name, dtype in list(zip(batch.schema.names, batch.schema.types)):
        assert columns[name].name == name
        assert columns[name].dtype == dtype
        assert columns[name].shape == [4]

    os.unlink(f.name)
def convert2feather(fname: str, out_folder: str, name: str,
                    extension: str = "feather") -> str:
    """
    Convert a whole genome rankings database to a feather format based database.

    More information on this format can be found here:
    .. feather-format: https://blog.rstudio.com/2016/03/29/feather/

    :param fname: The filename of the legacy whole genome rankings database.
    :param out_folder: The name of the folder to write the new database to.
    :param name: The name of the rankings database.
    :param extension: The extension of the new database file.
    :return: The filename of the new database.
    """
    assert os.path.isfile(fname), "{} does not exist.".format(fname)
    assert os.path.isdir(out_folder), "{} is not a directory.".format(
        out_folder)

    feather_fname = os.path.join(
        out_folder, "{}.{}".format(
            os.path.splitext(os.path.basename(fname))[0], extension))
    assert not os.path.exists(feather_fname), "{} already exists.".format(
        feather_fname)

    # Load original database into memory.
    # Caveat: the original storage format of whole genome rankings does not
    # store the metadata, i.e. name.
    db = SQLiteRankingDatabase(fname=fname, name=name)
    df = db.load_full()
    df.index.name = INDEX_NAME
    # Index is not stored in feather format.
    # https://github.com/wesm/feather/issues/200
    df.reset_index(inplace=True)
    write_feather(df, feather_fname)
    return feather_fname
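# Hypothetical invocation of convert2feather (paths and database name below are
# placeholders, not taken from the source):
# feather_db = convert2feather("databases/hg38_rankings.db",
#                              out_folder="databases",
#                              name="hg38 whole genome rankings")
# print(feather_db)  # databases/hg38_rankings.feather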
def test_feather_format(tempdir):
    from pyarrow.feather import write_feather

    table = pa.table({
        'a': pa.array([1, 2, 3], type="int8"),
        'b': pa.array([.1, .2, .3], type="float64")
    })

    basedir = tempdir / "feather_dataset"
    basedir.mkdir()
    write_feather(table, str(basedir / "data.feather"))

    dataset = ds.dataset(basedir, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    dataset = ds.dataset(basedir, format="feather")
    result = dataset.to_table()
    assert result.equals(table)

    # error with Feather v1 files
    write_feather(table, str(basedir / "data1.feather"), version=1)
    with pytest.raises(ValueError):
        ds.dataset(basedir, format="feather").to_table()
def count_ligs(chunk):
    print(type(chunk))
    print(chunk.schema)
    table_batch = chunk
    smiles = list(table_batch.column('smiles'))
    names = list(table_batch.column('zinc_id'))

    fingerprint_function = rdMolDescriptors.GetMorganFingerprintAsBitVect
    pars = {
        "radius": 2,
        "nBits": 8192,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": False,
        "useBondTypes": True,
        "useFeatures": True,
    }

    print(f'the number of smiles in the record batch is {len(smiles)}')
    count_ligs = len(smiles)
    smiles = [x for x in smiles]

    named_mols = []
    for count, m in enumerate(smiles):
        mol = Chem.MolFromSmiles(str(m))
        molid = names[count]
        mol.SetProp('_Name', str(molid))
        named_mols.append(mol)

    # generate fingerprints once (the original computed them twice)
    tab = gen_fp(named_mols, pars, fingerprint_function)

    target_directory = '/data/dockop_data/feathers_zinc_15'
    name = os.path.join(target_directory, 'zinc_15_subset%s' + '.feather')
    unique_feather = next_path(name)
    with open(unique_feather, 'wb') as f:
        feather.write_feather(tab, f)
    print(f'The feather was written to {unique_feather}')
    return unique_feather
def storeModel(self):
    if os.path.isdir(self.brain_path):
        shutil.rmtree(self.brain_path)
    os.makedirs(self.brain_path)

    file_path_nodes = os.path.join(self.brain_path, 'hidden.feather')
    for i in range(len(self.weights)):
        file_path_weights = os.path.join(self.brain_path, f'weights{i}.feather')
        file_path_biases = os.path.join(self.brain_path, f'biases{i}.feather')
        feather.write_feather(pd.DataFrame(self.weights[i]), file_path_weights)
        feather.write_feather(pd.DataFrame(self.biases[i]), file_path_biases)
    feather.write_feather(pd.DataFrame(self.hidden_nodes), file_path_nodes)
def bench_write(self, niter=2):
    print("Reading text file: {}".format(self.csv_path))
    df = pd.read_csv(self.csv_path, sep=self.sep,
                     header=self.header, low_memory=False)
    if self.header is None:
        df.columns = ['f{}'.format(i) for i in range(len(df.columns))]

    def _get_table(df):
        return (pa.Table.from_pandas(df, preserve_index=False)
                .replace_schema_metadata(None))

    t = _get_table(df)

    cases = [
        ('parquet (UNC)', 'arrow Table',
         lambda: pq.write_table(t, self.parquet_unc_path,
                                compression='NONE')),
        ('parquet (UNC)', 'pandas',
         lambda: pq.write_table(_get_table(df), self.parquet_unc_path,
                                compression='NONE')),
        ('parquet (SNAPPY)', 'arrow Table',
         lambda: pq.write_table(t, self.parquet_snappy_path)),
        ('parquet (SNAPPY)', 'pandas',
         lambda: pq.write_table(_get_table(df), self.parquet_snappy_path)),
        ('feather V2 (UNC)', 'pandas',
         lambda: feather.write_feather(df, self.feather_unc_path,
                                       compression='uncompressed')),
        ('feather V2 (UNC)', 'arrow Table',
         lambda: feather.write_feather(t, self.feather_unc_path,
                                       compression='uncompressed')),
        ('feather V2 (LZ4)', 'pandas',
         lambda: feather.write_feather(df, self.feather_lz4_path,
                                       compression='lz4')),
        ('feather V2 (LZ4)', 'arrow Table',
         lambda: feather.write_feather(t, self.feather_lz4_path,
                                       compression='lz4')),
        ('feather V2 (ZSTD)', 'pandas',
         lambda: feather.write_feather(df, self.feather_zstd_path,
                                       compression='zstd')),
        ('feather V2 (ZSTD)', 'arrow Table',
         lambda: feather.write_feather(t, self.feather_zstd_path,
                                       compression='zstd'))
    ]
    return self._bench_cases(cases, niter)
def to_feather(self, expandCategory, expandTime, preprocessType,
               seperateLabels, chunksize):
    """
    to_feather transforms Time_Series_Data or Time_Series_Data_Collection
    into a feather file

    Parameters
    ----------
    expandCategory : bool
        whether to expand category
    expandTime : bool
        whether to expand time
    preprocessType : ['ignore','pad','remove']
        preprocess data time across categories
    seperateLabels : bool
        whether to separate labels and data
    chunksize : int
        size of feather file
    """
    if not seperateLabels:
        table = to_arrow_table(
            time_series=self.time_series,
            expandCategory=expandCategory,
            expandTime=expandTime,
            preprocessType=preprocessType,
            seperateLabels=seperateLabels
        )
        pf.write_feather(table, self.dirPaths, version=self.version,
                         chunksize=chunksize)
        return

    table, label_table = to_arrow_table(
        time_series=self.time_series,
        expandCategory=expandCategory,
        expandTime=expandTime,
        preprocessType=preprocessType,
        seperateLabels=seperateLabels
    )
    pf.write_feather(table, self.dirPaths[0], version=self.version,
                     chunksize=chunksize)
    pf.write_feather(label_table, self.dirPaths[1], version=self.version,
                     chunksize=chunksize)
def f():
    write_feather(df, path)
    # (fragment: this block appears to be the tail of the duplicate-renaming
    # helper invoked below as remove_duplicates())
    indices = duplicateRowsDF.index
    names = duplicateRowsDF['names']
    dupenum = 2
    for count, indexid in enumerate(indices):
        print(df.loc[indexid, 'names'])
        new_value = f'{names[indexid]}_{str(dupenum)}'
        print(new_value)
        df.at[indexid, 'names'] = new_value
    return df


molchunk_path = '/data/dopamine_3_results/mol_chunk_docking/mol_chunks_test_firstreal_first400.molchunk'
autodock_gpu = '/home/schrogpu/ADFRsuite-1.0/AutoDock-GPU/bin/autodock_gpu_128wi'
receptor_path = '/home/schrogpu/ADFRsuite-1.0/pocket2_fixer_moreatoms/rigidReceptor.maps.fld'
lsmet = 'sw'
num_runs = 50
col_to_dock = 'pdbqt_block_am1bcc'
# col_to_dock = 'pdbqt_gast_list'
working_dir = '/data/dopamine_3_results/mol_chunk_docking/first400d3'

df = df_from_molchunk(molchunk_path)
df = remove_duplicates(df)
mols_to_pymol(df)
outdf = run_autodock_gpu(df, col_to_dock, autodock_gpu, lsmet, num_runs,
                         working_dir, receptor_path)
indexed_df = index_docking_output(outdf)
final_df = extract_specific_pdbqt(indexed_df, num_runs, working_dir)
show_docked(final_df)
del final_df['ROMol']
feather.write_feather(final_df, '/data/dopamine_3_results/mol_chunk_docking/mol_chunks_test_firstreal_first400_out.molchunk')
def to_feather(
    df: DataFrame,
    path: FilePathOrBuffer[AnyStr],
    storage_options: StorageOptions = None,
    **kwargs,
):
    """
    Write a DataFrame to the binary Feather format.

    Parameters
    ----------
    df : DataFrame
    path : string file path, or file-like object
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values.

        .. versionadded:: 1.2.0

    **kwargs :
        Additional keywords passed to `pyarrow.feather.write_feather`.

        .. versionadded:: 1.1.0
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    ioargs = get_filepath_or_buffer(path, mode="wb", storage_options=storage_options)

    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    valid_types = {"string", "unicode"}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        typ = type(df.index)
        raise ValueError(
            f"feather does not support serializing {typ} "
            "for the index; you can .reset_index() to make the index into column(s)"
        )

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError(
            "feather does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )

    if df.index.name is not None:
        raise ValueError(
            "feather does not serialize index meta-data on a default index"
        )

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs)

    if ioargs.should_close:
        assert not isinstance(ioargs.filepath_or_buffer, str)
        ioargs.filepath_or_buffer.close()
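# Minimal sketch exercising the file-like-buffer path of the to_feather above
# (assumes the function is importable in the current scope; the column data is arbitrary).
import io
import pandas as pd

buf = io.BytesIO()
to_feather(pd.DataFrame({"a": [1, 2, 3]}), buf)   # default RangeIndex passes validation
buf.seek(0)
assert pd.read_feather(buf)["a"].tolist() == [1, 2, 3]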
def f():
    write_feather(df, path)
# call sentiment function
sentiment_over_time(0, lang, date_range_input, file, subfolder)

# load the created csv with the missing data
df_final = pd.read_csv(r"C:\Users\simon\Desktop\WS_20_21\DS_12\test.csv")

# reset index
df_final = df_final.reset_index()

# rename columns
df_final = df_final.rename(columns={"level_0": "language",
                                    "level_1": "retweet_count",
                                    "level_2": "date",
                                    "level_3": "company",
                                    "level_4": "sentiment_mean",
                                    "level_5": "sentiment_weight_retweet",
                                    "level_6": "sentiment_weight_length",
                                    "Unnamed: 0": "sentiment_weight_likes"})

# append dataframe to the original dataframe
df_full = df_full.append(df_final)

# sort values
df_full = df_full.sort_values(by=['company', 'language', 'retweet_count'])

# write a feather file with the complete dataframe
feather.write_feather(df_full, r"C:\Users\simon\Desktop\WS_20_21\DS_12\test_feather")
def f():
    write_feather(df, path, version=version)
def convert_to_cache(my_cccc, input_file_list, out_dir, checksum_arrow_file,
                     out_list_file, tmp_grib_file, conf_list, debug):
    warno = 189
    checksum_df = feather.read_feather(checksum_arrow_file)
    checksum_list = []
    now = datetime.utcnow()
    for in_file in input_file_list:
        if debug:
            print('Debug', ':', 'in_file =', in_file, file=sys.stderr)
        with open(in_file, 'rb') as in_file_stream:
            batch_type = 0
            message_length = 0
            start_byte4 = None
            start_char4 = None
            try:
                start_byte4 = in_file_stream.read(4)
                if len(start_byte4) < 4:
                    break
                start_char4 = start_byte4.decode()
            except:
                print('Warning', warno, ':', 'The start 4 bytes of', in_file,
                      'are not strings.', file=sys.stderr)
            while start_char4:
                if debug:
                    print('Debug', ':', 'start_char4 =', start_char4,
                          file=sys.stderr)
                message = bytearray()
                if re.match(r'\d\d\d\d', start_char4):
                    batch_type = 1
                    message_length = int(start_char4 +
                                         in_file_stream.read(4).decode())
                    try:
                        if message_length == 0:
                            break
                        format_identifier = int(in_file_stream.read(2).decode())
                        if format_identifier == 0:
                            in_file_stream.read(10)  # skip
                            message_length -= 10
                        elif format_identifier == 1:
                            in_file_stream.read(3)  # skip
                            message_length -= 3
                        else:
                            print('Warning', warno, ':',
                                  'The format identifier of', in_file,
                                  'is not 00 or 01.', file=sys.stderr)
                            break
                    except:
                        print('Warning', warno, ':',
                              'The bytes of message length on', in_file,
                              'are not strings.', file=sys.stderr)
                        break
                elif start_char4 == '####':
                    try:
                        batch_type = 2
                        in_file_stream.read(3)  # skip '018'
                        message_length = int(in_file_stream.read(6).decode())
                        in_file_stream.read(5)  # skip ####\n
                    except:
                        print('Warning', warno, ':',
                              'The bytes of message length on', in_file,
                              'are not strings.', file=sys.stderr)
                        break
                elif start_char4 == '****':
                    try:
                        batch_type = 3
                        message_length = int(in_file_stream.read(10).decode())
                        in_file_stream.read(5)  # skip ****\n
                    except:
                        print('Warning', warno, ':',
                              'The bytes of message length on', in_file,
                              'are not strings.', file=sys.stderr)
                        break
                else:
                    try:
                        message.extend(start_char4.encode())
                        message.extend(in_file_stream.read())
                    except:
                        print('Warning', warno, ':', 'can not encode or read',
                              in_file, file=sys.stderr)
                        break
                    out_file = create_file(in_file, my_cccc, message,
                                           start_char4, out_dir,
                                           tmp_grib_file, conf_list, debug)
                    if out_file:
                        out_file_checksum = getHash(out_file)
                        if (len(checksum_df[checksum_df['checksum'] ==
                                            out_file_checksum].index) == 0
                                and out_file_checksum not in checksum_list):
                            checksum_list.append(out_file_checksum)
                            print(out_file, file=out_list_file)
                        else:
                            os.remove(out_file)
                    break
                if message_length <= 0:
                    if debug:
                        print('Debug', ':', 'The message length of', in_file,
                              'is invalid. (<=0)', file=sys.stderr)
                    break
                if debug:
                    print('Debug', ':', 'batch_type =', batch_type,
                          ', message_length =', message_length,
                          file=sys.stderr)
                if batch_type == 1:
                    message = bytearray(in_file_stream.read(message_length))
                elif batch_type == 2 or batch_type == 3:
                    message = bytearray(in_file_stream.read(message_length))
                    message_counter = len(message) - 1
                    while message_counter > -1:
                        if (message[message_counter] == 3
                                or message[message_counter] == 10
                                or message[message_counter] == 13
                                or message[message_counter] == 32):
                            message.pop(message_counter)
                        else:
                            break
                        message_counter -= 1
                    message_counter = 0
                    while message_counter < len(message):
                        if message[0] == 10 or message[0] == 13 or message[0] == 32:
                            message.pop(0)
                        else:
                            break
                        message_counter += 1
                out_file = create_file_from_batch(in_file, my_cccc, message,
                                                  out_dir, tmp_grib_file,
                                                  conf_list, debug)
                if out_file:
                    out_file_checksum = getHash(out_file)
                    if (len(checksum_df[checksum_df['checksum'] ==
                                        out_file_checksum].index) == 0
                            and out_file_checksum not in checksum_list):
                        checksum_list.append(out_file_checksum)
                        print(out_file, file=out_list_file)
                    else:
                        os.remove(out_file)
                try:
                    byte4 = in_file_stream.read(4)
                    if len(byte4) < 4:
                        break
                    start_char4 = byte4.decode()
                except:
                    start_char4 = None
                    print('Warning', warno, ':',
                          'The start 4 bytes of the message on', in_file,
                          'are not strings.', file=sys.stderr)
    if len(checksum_list) > 0:
        td = timedelta(days=1)
        new_checksum_df = pd.concat(
            [checksum_df[(checksum_df['mtime'] >= now - td) &
                         (checksum_df['mtime'] <= now + td)],
             pd.DataFrame({"mtime": [now] * len(checksum_list),
                           "checksum": checksum_list})])
        with open(checksum_arrow_file, 'bw') as checksum_arrow_f:
            feather.write_feather(new_checksum_df, checksum_arrow_f,
                                  compression='zstd')
data = base_data.copy()
for interval in intervals:
    print(interval.name)
    if interval == CandlestickInterval.minutes1:
        del dfs[interval]
        continue
    else:
        temp_df = dfs[interval][["CloseTime", "OpenPrice", "HighPrice",
                                 "LowPrice", "ClosePrice", "Volume",
                                 "NumberTrades"]]
        data = pd.merge_asof(data.sort_values("CloseTime"),
                             temp_df.sort_values("CloseTime"),
                             on="CloseTime",
                             suffixes=(None, f'_{interval.name}'))
        del dfs[interval]
        del temp_df

data = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(data.values))
feather.write_feather(data, data_path)

idx = 0
first = True

data: pd.DataFrame = feather.read_feather(data_path).iloc[:-45000]

highestFit = float('-inf')  # [float('-inf') for i in range(len(HIDDEN_LAYERS))]
generationCount = 0  # [0 for i in range(len(HIDDEN_LAYERS))]

print("Generating traders")
bestTrader = None  # [None for i in range(len(HIDDEN_LAYERS))]
previousTraders: list[Trader] = []

while generationCount <= AMOUNTOFGENS:
    totalFitness = 0
    new = False
    allTraders: list[Trader] = []
    if first:
    # (fragment: this block appears to be the tail of main(start, end, step),
    # which is invoked below)
    pool = mp.Pool()
    p_dict = {}
    for i in range(int((end - start) / step)):
        p_dict['p' + str(i)] = pool.apply_async(get_char_daily, (
            df['crsp%s' % i],
            df['firm%s' % i],
        ))
    pool.close()
    pool.join()
    result = pd.DataFrame()
    print('processing pd.concat')
    for h in range(int((end - start) / step)):
        result = pd.concat([result, p_dict['p%s' % h].get()])
    return result


# calculate variance of residual through rolling window
# Note: please split dataframe according to your CPU situation. For example,
# we split dataframe to (1-0)/0.05 = 20 sub dataframes here, so the function
# will use 20 cores to calculate variance of residual.
if __name__ == '__main__':
    crsp_out = main(0, 1, 0.05)

    # process dataframe
    crsp_out = crsp_out.dropna(subset=['rvar'])  # drop NA due to rolling
    crsp_out = crsp_out.rename(columns={'rvar': 'rvar_capm'})
    crsp_out = crsp_out.reset_index(drop=True)
    crsp_out = crsp_out[['permno', 'date', 'rvar_capm']]

    with open('rvar_capm_daily.feather', 'wb') as f:
        feather.write_feather(crsp_out, f)
# (fragment: "reset_index()" below closes a chained aggregation assigned to pmnts
# whose beginning is not included here)
    reset_index()
pmnts.columns = [
    i[0] if i[1] == '' else '_'.join(i) for i in pmnts.columns.values
]

# join all tables
df = pd.merge(pmnts, num_inst_len, on=["SK_ID_CURR", "SK_ID_PREV"])
df = pd.merge(df, num_inst_ver, on=["SK_ID_CURR", "SK_ID_PREV"])

# aggregate by SK_ID_CURR
df = df.drop(['SK_ID_PREV'], axis=1)
df = df.groupby('SK_ID_CURR').\
    agg([np.mean, np.sum, np.std]).\
    reset_index()
df.columns = [
    i[0] if i[1] == '' else '_'.join(reversed(i)) for i in df.columns.values
]

# save as feather file
# feather.write_feather(df, 'instalment_payments')
suffix = '.feather'
filePath = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                        table + '_features' + suffix)
feather.write_feather(df, filePath)