Example #1
def str_as_dataframe(
    env: Environment,
    test_data: str,
    package: Optional[DataFunctionPackage] = None,
    nominal_schema: Optional[Schema] = None,
) -> DataFrame:
    # TODO: add conform_dataframe_to_schema option
    # TODO: support files
    # if test_data.endswith(".csv"):
    #     if module is None:
    #         raise
    #     with module.open_module_file(test_data) as f:
    #         raw_records = list(read_csv(f.readlines()))
    # elif test_data.endswith(".json"):
    #     if module is None:
    #         raise
    #     with module.open_module_file(test_data) as f:
    #         raw_records = [read_json(line) for line in f]
    # else:
    # Raw str csv
    raw_records = list(read_raw_string_csv(test_data))
    tmp = "_test_obj_" + rand_str()
    env._local_python_storage.get_api().put(tmp, raw_records)
    if nominal_schema is None:
        auto_schema = infer_schema_for_name(tmp, env._local_python_storage)
        nominal_schema = auto_schema
    else:
        PythonRecordsHandler().cast_to_schema(
            tmp, env._local_python_storage, nominal_schema
        )
    df = DataFrame.from_records(raw_records)
    return df
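
All of these examples use rand_str to build unique names for temporary objects (for example "_tmp_obj_" + rand_str(10)). A minimal sketch of what such a helper could look like is below; the default length and character set are assumptions for illustration, not the actual dcp implementation.

import random
import string

def rand_str(n: int = 10) -> str:
    # Return n random alphanumeric characters, suitable for temporary object names
    alphabet = string.ascii_lowercase + string.digits
    return "".join(random.choice(alphabet) for _ in range(n))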
Example #2
def handle_python_object(self, obj: Any) -> Tuple[str, Storage]:
    name = "_tmp_obj_" + rand_str(10)
    if isinstance(obj, IOBase):
        # Handle file-like by writing to disk first
        file_storages = [
            s for s in self.execution_context.storages
            if s.storage_engine.storage_class == FileSystemStorageClass
        ]
        if not file_storages:
            raise Exception(
                "File-like object returned but no file storage provided. "
                "Add a file storage to the environment: `env.add_storage('file:///....')`"
            )
        if self.execution_context.target_storage in file_storages:
            storage = self.execution_context.target_storage
        else:
            storage = file_storages[0]
        mode = "w"
        if isinstance(obj, (RawIOBase, BufferedIOBase)):
            mode = "wb"
        with storage.get_api().open(name, mode) as f:
            for s in obj:
                f.write(s)
    else:
        storage = self.execution_context.local_storage
        storage.get_api().put(name, obj)
    return name, storage
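
Note the storage selection in this example: a file-like object is written to the configured target storage when that storage is file-based, otherwise to the first available file storage, and the write mode switches to binary ("wb") for raw or buffered streams. In-memory objects skip the file step and go straight into local Python storage via put.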
Example #3
def handle_emit(
    self,
    records_obj: Any = None,
    name: str = None,
    storage: Storage = None,
    output: str = DEFAULT_OUTPUT_NAME,
    data_format: DataFormat = None,
    schema: SchemaLike = None,
):
    logger.debug(
        f"HANDLING EMITTED OBJECT (of type '{type(records_obj).__name__}')"
    )
    # TODO: can i return an existing DataBlock? Or do I need to create a "clone"?
    #   Answer: ok to return as is (just mark it as 'output' in DBL)
    if isinstance(records_obj, StoredDataBlockMetadata):
        # TODO is it in local storage tho? we skip conversion below...
        # This is just a special case right now to support SQL snap
        # Will need better solution for explicitly creating DB/SDBs inside of snaps
        return records_obj
    elif isinstance(records_obj, DataBlockMetadata):
        raise NotImplementedError
    elif isinstance(records_obj, ManagedDataBlock):
        raise NotImplementedError
    nominal_output_schema = schema
    if nominal_output_schema is None:
        nominal_output_schema = self.bound_interface.resolve_nominal_output_schema(
            self.env
        )  # TODO: could check output to see if it is LocalRecords with a schema too?
    if nominal_output_schema is not None:
        nominal_output_schema = self.env.get_schema(nominal_output_schema)
    sdb = self.get_stored_datablock_for_output(output)
    sdb.data_format = data_format
    db = sdb.data_block
    if db.nominal_schema_key and db.nominal_schema_key != nominal_output_schema.key:
        raise Exception(
            f"Mismatched nominal schemas: {db.nominal_schema_key} - {nominal_output_schema.key}"
        )
    db.nominal_schema_key = nominal_output_schema.key
    if records_obj is not None:
        name = "_tmp_obj_" + rand_str(10)
        storage = self.execution_context.local_storage
        storage.get_api().put(name, records_obj)
        if nominal_output_schema is not None:
            # TODO: still unclear on when and why to do this cast
            handler = get_handler_for_name(name, storage)
            handler().cast_to_schema(name, storage, nominal_output_schema)
    sdb.storage_url = storage.url
    assert name is not None
    assert storage is not None
    self.append_records_to_stored_datablock(name, storage, sdb)
    return sdb
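
In this emit handler, an in-memory result is first staged under a random temporary name in local Python storage, optionally cast to the resolved nominal schema, and only then appended to the stored data block; the name and storage parameters are only used as-is when no records object is passed in.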
Example #4
def as_tmp_local_object(self, obj: Any) -> str:
    tmp_name = "_tmp_obj_" + rand_str()
    self.execution_context.local_storage.get_api().put(tmp_name, obj)
    yield tmp_name
    self.execution_context.local_storage.get_api().remove(tmp_name)
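
A minimal usage sketch for the helper above, assuming it is wrapped with contextlib.contextmanager (the put / yield / remove shape suggests a context manager); manager and records are hypothetical stand-ins, and get_handler_for_name is the helper seen in Example #3.

with manager.as_tmp_local_object(records) as tmp_name:
    # The object is addressable in local python storage only inside this block
    handler = get_handler_for_name(tmp_name, manager.execution_context.local_storage)
# On exit, the temporary object has been removed from local storage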
Example #5
File: utils.py Project: kvh/dcp
def get_tmp_sqlite_db_url(dbname=None):
    if dbname is None:
        dbname = rand_str(10)
    dir = tempfile.mkdtemp()
    return f"sqlite:///{dir}/{dbname}.db"