def get_records_sample(cls, obj: Any, n: int = 200) -> Optional[List[Dict]]:
    """Return up to *n* parsed records from the head of *obj*.

    Returns None when no head sample can be taken from *obj*.
    """
    head_sample = cls.head(obj, n)
    if head_sample is None:
        return None
    return list(read_csv(head_sample))
def copy_delim_file_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Copy a delimited file on filesystem storage into in-memory Records on python storage."""
    assert isinstance(from_storage_api, FileSystemStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    # Parse the whole file eagerly while the handle is open
    with from_storage_api.open(from_name) as src:
        parsed = list(read_csv(src.readlines()))
    records_mdr = as_records(parsed, data_format=RecordsFormat, schema=schema)
    records_mdr = records_mdr.conform_to_schema()
    to_storage_api.put(to_name, records_mdr)
def copy_file_object_iterator_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Wrap a stored file-object iterator as a lazy Records iterator, conformed to *schema*."""
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    source_mdr = from_storage_api.get(from_name)
    # Each chunk is re-parsed lazily; with_header keeps the header on every chunk
    records_iter = (read_csv(piece) for piece in with_header(source_mdr.records_object))
    result_mdr = as_records(records_iter, data_format=RecordsIteratorFormat, schema=schema)
    result_mdr = result_mdr.conform_to_schema()
    to_storage_api.put(to_name, result_mdr)
def copy_file_object_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Parse a stored delimited file object into Records on python storage."""
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    source_mdr = from_storage_api.get(from_name)
    parsed = read_csv(source_mdr.records_object)
    result_mdr = as_records(parsed, data_format=RecordsFormat, schema=schema)
    result_mdr = result_mdr.conform_to_schema()
    to_storage_api.put(to_name, result_mdr)
def copy_file_object_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
    chunk_size: int = 1000,
):
    """Stream a stored delimited file object as a lazy Records iterator.

    The file object is consumed in chunks of *chunk_size* lines; each chunk is
    re-parsed with the header prepended (via ``with_header``) so every chunk is
    independently parseable.

    Args:
        from_name: key of the source file object on python storage
        to_name: key under which the resulting records iterator is stored
        conversion: the Conversion describing this copy (unused here, kept for
            the common copier signature)
        from_storage_api: must be a PythonStorageApi
        to_storage_api: must be a PythonStorageApi
        schema: schema the resulting records are conformed to
        chunk_size: number of lines per parsed chunk (was a hard-coded 1000;
            now configurable, default preserves prior behavior)
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    # Note: must keep header on each chunk when iterating delimited file object!
    itr = (
        read_csv(chunk)
        for chunk in with_header(iterate_chunks(mdr.records_object, chunk_size))
    )
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def str_as_dataframe(
    test_data: str,
    module: Optional[SnapflowModule] = None,
    nominal_schema: Optional[Schema] = None,
) -> DataFrame:
    """Load test data into a DataFrame.

    *test_data* is interpreted as a module-relative ``.csv`` or ``.json`` file
    path (requires *module*), or otherwise as a raw csv string. If
    *nominal_schema* is not given, one is inferred from the parsed records.

    Raises:
        ValueError: if *test_data* names a file but *module* is None.
    """
    # TODO: add conform_dataframe_to_schema option
    if test_data.endswith(".csv"):
        # Bug fix: a bare `raise` outside an except block raises
        # RuntimeError("No active exception to re-raise") — raise explicitly.
        if module is None:
            raise ValueError("A module is required to load csv test data from a file")
        with module.open_module_file(test_data) as f:
            raw_records = list(read_csv(f.readlines()))
    elif test_data.endswith(".json"):
        if module is None:
            raise ValueError("A module is required to load json test data from a file")
        with module.open_module_file(test_data) as f:
            raw_records = [read_json(line) for line in f]
    else:
        # Raw str csv
        raw_records = read_raw_string_csv(test_data)
    if nominal_schema is None:
        nominal_schema = infer_schema_from_records(raw_records)
    return records_to_dataframe(raw_records, nominal_schema)
def test_records_to_file():
    """Round-trip: records on python storage -> delimited file -> parsed back equals original."""
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = RecordsFormat
    obj = [{"f1": "hi", "f2": 2}]
    # Seed python storage with a single record
    mem_api.put(name, as_records(obj, data_format=fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(s.storage_engine, DelimitedFileFormat),
    )
    copy_records_to_delim_file.copy(
        name, name, conversion, mem_api, fs_api, schema=TestSchema4
    )
    # Read the written file back and confirm the round trip is lossless
    with fs_api.open(name) as f:
        round_tripped = list(read_csv(f))
    round_tripped = RecordsFormat.conform_records_to_schema(round_tripped, TestSchema4)
    assert round_tripped == obj